Skip to content

Commit

Permalink
Merge pull request #28 from greyblake/prepare-v06
Browse files Browse the repository at this point in the history
Prepare for v 0.6.0
  • Loading branch information
greyblake authored Nov 9, 2018
2 parents e23fb7d + 201c89e commit 73e1e2f
Show file tree
Hide file tree
Showing 16 changed files with 359 additions and 261 deletions.
8 changes: 8 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,11 @@ language: rust
rust:
- 1.30.1
- stable
install:
- rustup component add rustfmt-preview
- rustup component add clippy-preview
script:
- cargo fmt -- --check
- cargo clippy -- -D warnings
- cargo test
- cargo package
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ include = [
"src/**/*",
"test/**/*",
"misc/data.json",
"misc/supported_laguages.csv",
"misc/supported_languages.csv",
"templates/lang.rs",
"build.rs",
"Cargo.toml",
"README.md"
Expand Down
2 changes: 1 addition & 1 deletion benches/example.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[macro_use]
extern crate bencher;
extern crate whatlang;
extern crate serde_json;
extern crate whatlang;

use bencher::Bencher;
use std::collections::HashMap;
Expand Down
42 changes: 30 additions & 12 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
extern crate csv;
extern crate skeptic;
extern crate serde_json;
extern crate serde;
extern crate serde_json;
extern crate skeptic;
#[macro_use]
extern crate serde_derive;
extern crate tera;

use std::io::{Write, BufReader, BufWriter};
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::Path;
use std::env;

const DATA_PATH: &'static str = "misc/data.json";
const SUPPORTED_LANG_PATH: &'static str = "misc/supported_languages.csv";
Expand Down Expand Up @@ -53,16 +53,21 @@ fn generate_source_files() {

fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
let data_file = BufReader::new(File::open(DATA_PATH).unwrap());
let mut lang_reader = csv::ReaderBuilder::new().flexible(true).from_path(SUPPORTED_LANG_PATH).unwrap();
let mut lang_reader = csv::ReaderBuilder::new()
.flexible(true)
.from_path(SUPPORTED_LANG_PATH)
.unwrap();

let mut lang_infos: Vec<LangInfo> = lang_reader.deserialize().map(Result::unwrap).collect();
lang_infos.sort_by(|left, right| left.code.cmp(&right.code));

let supported_lang_codes: HashMap<String, LangInfo> = lang_infos.iter()
let supported_lang_codes: HashMap<String, LangInfo> = lang_infos
.iter()
.map(|lang| (lang.code.clone(), lang.clone()))
.collect();

let lang_data: HashMap<String, HashMap<String, String>> = serde_json::from_reader(data_file).unwrap();
let lang_data: HashMap<String, HashMap<String, String>> =
serde_json::from_reader(data_file).unwrap();

let mut scripts: HashMap<String, Vec<Lang>> = HashMap::with_capacity(lang_data.len());
let mut all_langs: Vec<Lang> = Vec::new();
Expand All @@ -75,23 +80,36 @@ fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
let lang = Lang {
info: (*info).clone(),
script: script.clone(),
trigrams: trigrams.split('|').map(Into::into).collect()
trigrams: trigrams.split('|').map(Into::into).collect(),
};
if lang.trigrams.len() != TRIGRAM_COUNT {
panic!("Language {} has {} trigrams, instead of {}", code, lang.trigrams.len(), TRIGRAM_COUNT);
panic!(
"Language {} has {} trigrams, instead of {}",
code,
lang.trigrams.len(),
TRIGRAM_COUNT
);
}

all_langs.push(lang.clone());
scripts.entry(script.clone()).or_insert_with(Vec::new).push(lang);
scripts
.entry(script.clone())
.or_insert_with(Vec::new)
.push(lang);
}
}

(lang_infos, scripts)
}

fn render_lang_rs(buf: &mut BufWriter<File>, lang_infos: &[LangInfo], scripts: &HashMap<String, Vec<Lang>>) {
fn render_lang_rs(
buf: &mut BufWriter<File>,
lang_infos: &[LangInfo],
scripts: &HashMap<String, Vec<Lang>>,
) {
let mut tera = tera::Tera::default();
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs")).unwrap();
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs"))
.unwrap();

let mut ctx = tera::Context::new();
ctx.insert("lang_infos", lang_infos);
Expand Down
5 changes: 3 additions & 2 deletions examples/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ use whatlang::detect;
fn main() {
let mut text = String::new();
println!("Please enter a text:");
io::stdin().read_line(&mut text).expect("Failed to read line");
io::stdin()
.read_line(&mut text)
.expect("Failed to read line");

if let Some(info) = detect(&text) {
println!("Language: {}", info.lang());
Expand All @@ -16,4 +18,3 @@ fn main() {
println!("Cannot recognize a language :(");
}
}

106 changes: 63 additions & 43 deletions src/detect.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use hashbrown::HashMap;

use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE};
use info::Info;
use lang::*;
use options::{List, Options};
use script::*;
use trigrams::*;
use info::Info;
use options::{Options, List};
use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE};

/// Detect a language and a script by a given text.
///
Expand Down Expand Up @@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {

pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
detect_script(text).and_then(|script| {
detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| {
Info { lang, script, confidence }
detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info {
lang,
script,
confidence,
})
})
}

fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> {
fn detect_lang_based_on_script(
text: &str,
options: &Options,
script: Script,
) -> Option<(Lang, f64)> {
match script {
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
Script::Mandarin => Some((Lang::Cmn, 1.0)),
Script::Bengali => Some((Lang::Ben, 1.0)),
Script::Hangul => Some((Lang::Kor, 1.0)),
Script::Georgian => Some((Lang::Kat, 1.0)),
Script::Greek => Some((Lang::Ell, 1.0)),
Script::Kannada => Some((Lang::Kan, 1.0)),
Script::Tamil => Some((Lang::Tam, 1.0)),
Script::Thai => Some((Lang::Tha, 1.0)),
Script::Gujarati => Some((Lang::Guj, 1.0)),
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
Script::Telugu => Some((Lang::Tel, 1.0)),
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
Script::Mandarin => Some((Lang::Cmn, 1.0)),
Script::Bengali => Some((Lang::Ben, 1.0)),
Script::Hangul => Some((Lang::Kor, 1.0)),
Script::Georgian => Some((Lang::Kat, 1.0)),
Script::Greek => Some((Lang::Ell, 1.0)),
Script::Kannada => Some((Lang::Kan, 1.0)),
Script::Tamil => Some((Lang::Tam, 1.0)),
Script::Thai => Some((Lang::Tha, 1.0)),
Script::Gujarati => Some((Lang::Guj, 1.0)),
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
Script::Telugu => Some((Lang::Tel, 1.0)),
Script::Malayalam => Some((Lang::Mal, 1.0)),
Script::Oriya => Some((Lang::Ori, 1.0)),
Script::Myanmar => Some((Lang::Mya, 1.0)),
Script::Sinhala => Some((Lang::Sin, 1.0)),
Script::Khmer => Some((Lang::Khm, 1.0)),
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0))
Script::Oriya => Some((Lang::Ori, 1.0)),
Script::Myanmar => Some((Lang::Mya, 1.0)),
Script::Sinhala => Some((Lang::Sin, 1.0)),
Script::Khmer => Some((Lang::Khm, 1.0)),
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)),
}
}

fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> {
let mut lang_distances : Vec<(Lang, u32)> = vec![];
fn detect_lang_in_profiles(
text: &str,
options: &Options,
lang_profile_list: LangProfileList,
) -> Option<(Lang, f64)> {
let mut lang_distances: Vec<(Lang, u32)> = vec![];
let trigrams = get_trigrams_with_positions(text);

for &(ref lang, lang_trigrams) in lang_profile_list {
match options.list {
Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
_ => {},
_ => {}
}
let dist = calculate_distance(lang_trigrams, &trigrams);
lang_distances.push(((*lang), dist));
}

// Sort languages by distance
lang_distances.sort_by_key(|key| key.1 );
lang_distances.sort_by_key(|key| key.1);

// Return None if lang_distances is empty
// Return the only language with is_reliable=true if there is only 1 item
Expand Down Expand Up @@ -117,37 +127,36 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La
// * Text really matches one of the languages.
//
// Number 500.0 is based on experiments and common sense expectations.
let mut confidence = (score1 as f64) / 500.0;
let mut confidence = f64::from(score1) / 500.0;
if confidence > 1.0 {
confidence = 1.0;
}
return Some((lang_dist1.0, confidence));
}

let rate = (score1 - score2) as f64 / (score2 as f64);
let rate = f64::from(score1 - score2) / f64::from(score2);

// Hyperbola-shaped threshold: confident_rate = 12.0 / trigrams.len() + 0.05.
// A rate above this threshold gets confidence = 1.0.
// Otherwise confidence is scaled proportionally: rate / confident_rate.
// The constants 12.0 and 0.05 were obtained experimentally, so that the curve matches
// common sense expectations.
//
let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
let confidence =
if rate > confident_rate {
1.0
} else {
rate / confident_rate
};
let confidence = if rate > confident_rate {
1.0
} else {
rate / confident_rate
};

Some((lang_dist1.0, confidence))
}

fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
let mut total_dist = 0u32;

for (i, &trigram) in lang_trigrams.iter().enumerate() {
let dist = match text_trigrams.get(trigram) {
Some(&n) => (n as i32 - i as i32).abs() as u32,
None => MAX_TRIGRAM_DISTANCE
None => MAX_TRIGRAM_DISTANCE,
};
total_dist += dist;
}
Expand Down Expand Up @@ -186,7 +195,16 @@ mod tests {
assert_eq!(info.lang, Lang::Tgl);

// with blacklist
let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo];
let blacklist = vec![
Lang::Tgl,
Lang::Jav,
Lang::Nld,
Lang::Uzb,
Lang::Swe,
Lang::Nob,
Lang::Ceb,
Lang::Ilo,
];
let options = Options::new().set_blacklist(blacklist);
let output = detect_with_options(text, &options);
assert_eq!(output.is_some(), true);
Expand Down Expand Up @@ -224,7 +242,9 @@ mod tests {
let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
assert!(!info.is_reliable());

let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap();
let info =
detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm")
.unwrap();
assert!(!info.is_reliable());

// 1000 chars of randomly generated Cyrillic text
Expand Down
13 changes: 8 additions & 5 deletions src/detector.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use lang::Lang;
use script::Script;
use script::detect_script;
use detect;
use info::Info;
use lang::Lang;
use options::Options;
use detect;
use script::detect_script;
use script::Script;

/// Configurable structure that holds detection options and provides functions
/// to detect language and script.
Expand Down Expand Up @@ -72,7 +72,10 @@ mod tests {
#[test]
fn test_detect_script() {
// Russian, Cyrillic
assert_eq!(Detector::new().detect_script("Кириллица"), Some(Script::Cyrillic));
assert_eq!(
Detector::new().detect_script("Кириллица"),
Some(Script::Cyrillic)
);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const RELIABLE_CONFIDENCE_THRESHOLD: f64 = 0.8;
pub struct Info {
pub(crate) lang: Lang,
pub(crate) script: Script,
pub(crate) confidence: f64
pub(crate) confidence: f64,
}

impl Info {
Expand Down
8 changes: 4 additions & 4 deletions src/lang.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ impl Lang {
/// use whatlang::Lang;
/// assert_eq!(Lang::Ukr.name(), "Українська");
/// ```
pub fn name(&self) -> &'static str {
lang_to_name(*self)
pub fn name(self) -> &'static str {
lang_to_name(self)
}

/// Get a human-readable name of the language in English.
Expand All @@ -44,8 +44,8 @@ impl Lang {
/// use whatlang::Lang;
/// assert_eq!(Lang::Deu.eng_name(), "German");
/// ```
pub fn eng_name(&self) -> &'static str {
lang_to_eng_name(*self)
pub fn eng_name(self) -> &'static str {
lang_to_eng_name(self)
}
}

Expand Down
Loading

0 comments on commit 73e1e2f

Please sign in to comment.