From 8f56488c1744949f74986ab916ec26050a20442b Mon Sep 17 00:00:00 2001 From: Sergey Potapov Date: Fri, 9 Nov 2018 21:11:34 +0100 Subject: [PATCH 1/6] Add templates/lang.rs to Cargo.toml --- .travis.yml | 4 ++++ Cargo.toml | 1 + 2 files changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2ef885d..d0fed17 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,3 +2,7 @@ language: rust rust: - 1.30.1 - stable +script: + - cargo fmt -- --check + - cargo test + - cargo package diff --git a/Cargo.toml b/Cargo.toml index 060da1c..69f3ac9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ include = [ "test/**/*", "misc/data.json", "misc/supported_laguages.csv", + "templates/lang.rs", "build.rs", "Cargo.toml", "README.md" From a604c8c6abf70094f6a7219ec98a4b90fc1f3c05 Mon Sep 17 00:00:00 2001 From: Sergey Potapov Date: Fri, 9 Nov 2018 21:24:12 +0100 Subject: [PATCH 2/6] Fix typo: misc/supported_laguages.csv -> misc/supported_languages.csv --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 69f3ac9..5865d62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ include = [ "src/**/*", "test/**/*", "misc/data.json", - "misc/supported_laguages.csv", + "misc/supported_languages.csv", "templates/lang.rs", "build.rs", "Cargo.toml", From b26296e752231d6cd4e125a23e6ea83c319a1979 Mon Sep 17 00:00:00 2001 From: Sergey Potapov Date: Fri, 9 Nov 2018 21:25:07 +0100 Subject: [PATCH 3/6] Fix formatting --- benches/example.rs | 2 +- build.rs | 42 ++++-- examples/cli.rs | 5 +- src/detect.rs | 102 +++++++------ src/detector.rs | 13 +- src/info.rs | 2 +- src/lib.rs | 18 +-- src/options.rs | 4 +- src/script.rs | 346 ++++++++++++++++++++++++--------------------- src/trigrams.rs | 54 ++++--- src/utils.rs | 4 +- tests/detect.rs | 2 +- tests/proptests.rs | 3 +- 13 files changed, 343 insertions(+), 254 deletions(-) diff --git a/benches/example.rs b/benches/example.rs index c26a660..f0776fc 100644 --- a/benches/example.rs +++ b/benches/example.rs @@ -1,7 +1,7 @@ #[macro_use] extern crate bencher; -extern crate whatlang; extern crate serde_json; +extern crate whatlang; use bencher::Bencher; use std::collections::HashMap; diff --git a/build.rs b/build.rs index 24a160a..101ce47 100644 --- a/build.rs +++ b/build.rs @@ -1,16 +1,16 @@ extern crate csv; -extern crate skeptic; -extern crate serde_json; extern crate serde; +extern crate serde_json; +extern crate skeptic; #[macro_use] extern crate serde_derive; extern crate tera; -use std::io::{Write, BufReader, BufWriter}; use std::collections::HashMap; +use std::env; use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; use std::path::Path; -use std::env; const DATA_PATH: &'static str = "misc/data.json"; const SUPPORTED_LANG_PATH: &'static str = "misc/supported_languages.csv"; @@ -53,16 +53,21 @@ fn generate_source_files() { fn load_data() -> (Vec, HashMap>) { let data_file = BufReader::new(File::open(DATA_PATH).unwrap()); - let mut lang_reader = csv::ReaderBuilder::new().flexible(true).from_path(SUPPORTED_LANG_PATH).unwrap(); + let mut lang_reader = csv::ReaderBuilder::new() + .flexible(true) + .from_path(SUPPORTED_LANG_PATH) + .unwrap(); let mut lang_infos: Vec = lang_reader.deserialize().map(Result::unwrap).collect(); lang_infos.sort_by(|left, right| left.code.cmp(&right.code)); - let supported_lang_codes: HashMap = lang_infos.iter() + let supported_lang_codes: HashMap = lang_infos + .iter() .map(|lang| (lang.code.clone(), lang.clone())) .collect(); - let lang_data: HashMap> = serde_json::from_reader(data_file).unwrap(); + let lang_data: HashMap> = + serde_json::from_reader(data_file).unwrap(); let mut scripts: HashMap> = HashMap::with_capacity(lang_data.len()); let mut all_langs: Vec = Vec::new(); @@ -75,23 +80,36 @@ fn load_data() -> (Vec, HashMap>) { let lang = Lang { info: (*info).clone(), script: script.clone(), - trigrams: trigrams.split('|').map(Into::into).collect() + trigrams: trigrams.split('|').map(Into::into).collect(), }; if lang.trigrams.len() != TRIGRAM_COUNT { - panic!("Language {} has {} trigrams, instead of {}", code, lang.trigrams.len(), TRIGRAM_COUNT); + panic!( + "Language {} has {} trigrams, instead of {}", + code, + lang.trigrams.len(), + TRIGRAM_COUNT + ); } all_langs.push(lang.clone()); - scripts.entry(script.clone()).or_insert_with(Vec::new).push(lang); + scripts + .entry(script.clone()) + .or_insert_with(Vec::new) + .push(lang); } } (lang_infos, scripts) } -fn render_lang_rs(buf: &mut BufWriter, lang_infos: &[LangInfo], scripts: &HashMap>) { +fn render_lang_rs( + buf: &mut BufWriter, + lang_infos: &[LangInfo], + scripts: &HashMap>, +) { let mut tera = tera::Tera::default(); - tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs")).unwrap(); + tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs")) + .unwrap(); let mut ctx = tera::Context::new(); ctx.insert("lang_infos", lang_infos); diff --git a/examples/cli.rs b/examples/cli.rs index 99cb047..9e984af 100644 --- a/examples/cli.rs +++ b/examples/cli.rs @@ -6,7 +6,9 @@ use whatlang::detect; fn main() { let mut text = String::new(); println!("Please enter a text:"); - io::stdin().read_line(&mut text).expect("Failed to read line"); + io::stdin() + .read_line(&mut text) + .expect("Failed to read line"); if let Some(info) = detect(&text) { println!("Language: {}", info.lang()); @@ -16,4 +18,3 @@ fn main() { println!("Cannot recognize a language :("); } } - diff --git a/src/detect.rs b/src/detect.rs index 2b4eae8..f53cf80 100644 --- a/src/detect.rs +++ b/src/detect.rs @@ -1,11 +1,11 @@ use hashbrown::HashMap; +use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE}; +use info::Info; use lang::*; +use options::{List, Options}; use script::*; use trigrams::*; -use info::Info; -use options::{Options, List}; -use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE}; /// Detect a language and a script by a given text. /// @@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option { pub fn detect_with_options(text: &str, options: &Options) -> Option { detect_script(text).and_then(|script| { - detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| { - Info { lang, script, confidence } + detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info { + lang, + script, + confidence, }) }) } -fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> { +fn detect_lang_based_on_script( + text: &str, + options: &Options, + script: Script, +) -> Option<(Lang, f64)> { match script { - Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), - Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), + Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), + Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS), - Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), - Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), - Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), - Script::Mandarin => Some((Lang::Cmn, 1.0)), - Script::Bengali => Some((Lang::Ben, 1.0)), - Script::Hangul => Some((Lang::Kor, 1.0)), - Script::Georgian => Some((Lang::Kat, 1.0)), - Script::Greek => Some((Lang::Ell, 1.0)), - Script::Kannada => Some((Lang::Kan, 1.0)), - Script::Tamil => Some((Lang::Tam, 1.0)), - Script::Thai => Some((Lang::Tha, 1.0)), - Script::Gujarati => Some((Lang::Guj, 1.0)), - Script::Gurmukhi => Some((Lang::Pan, 1.0)), - Script::Telugu => Some((Lang::Tel, 1.0)), + Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), + Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), + Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), + Script::Mandarin => Some((Lang::Cmn, 1.0)), + Script::Bengali => Some((Lang::Ben, 1.0)), + Script::Hangul => Some((Lang::Kor, 1.0)), + Script::Georgian => Some((Lang::Kat, 1.0)), + Script::Greek => Some((Lang::Ell, 1.0)), + Script::Kannada => Some((Lang::Kan, 1.0)), + Script::Tamil => Some((Lang::Tam, 1.0)), + Script::Thai => Some((Lang::Tha, 1.0)), + Script::Gujarati => Some((Lang::Guj, 1.0)), + Script::Gurmukhi => Some((Lang::Pan, 1.0)), + Script::Telugu => Some((Lang::Tel, 1.0)), Script::Malayalam => Some((Lang::Mal, 1.0)), - Script::Oriya => Some((Lang::Ori, 1.0)), - Script::Myanmar => Some((Lang::Mya, 1.0)), - Script::Sinhala => Some((Lang::Sin, 1.0)), - Script::Khmer => Some((Lang::Khm, 1.0)), - Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)) + Script::Oriya => Some((Lang::Ori, 1.0)), + Script::Myanmar => Some((Lang::Mya, 1.0)), + Script::Sinhala => Some((Lang::Sin, 1.0)), + Script::Khmer => Some((Lang::Khm, 1.0)), + Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)), } } -fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> { - let mut lang_distances : Vec<(Lang, u32)> = vec![]; +fn detect_lang_in_profiles( + text: &str, + options: &Options, + lang_profile_list: LangProfileList, +) -> Option<(Lang, f64)> { + let mut lang_distances: Vec<(Lang, u32)> = vec![]; let trigrams = get_trigrams_with_positions(text); for &(ref lang, lang_trigrams) in lang_profile_list { match options.list { Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue, Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue, - _ => {}, + _ => {} } let dist = calculate_distance(lang_trigrams, &trigrams); lang_distances.push(((*lang), dist)); } // Sort languages by distance - lang_distances.sort_by_key(|key| key.1 ); + lang_distances.sort_by_key(|key| key.1); // Return None if lang_distances is empty // Return the only language with is_reliable=true if there is only 1 item @@ -131,23 +141,22 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La // Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense. // let confident_rate = (12.0 / trigrams.len() as f64) + 0.05; - let confidence = - if rate > confident_rate { - 1.0 - } else { - rate / confident_rate - }; + let confidence = if rate > confident_rate { + 1.0 + } else { + rate / confident_rate + }; Some((lang_dist1.0, confidence)) } -fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap) -> u32 { +fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap) -> u32 { let mut total_dist = 0u32; for (i, &trigram) in lang_trigrams.iter().enumerate() { let dist = match text_trigrams.get(trigram) { Some(&n) => (n as i32 - i as i32).abs() as u32, - None => MAX_TRIGRAM_DISTANCE + None => MAX_TRIGRAM_DISTANCE, }; total_dist += dist; } @@ -186,7 +195,16 @@ mod tests { assert_eq!(info.lang, Lang::Tgl); // with blacklist - let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo]; + let blacklist = vec![ + Lang::Tgl, + Lang::Jav, + Lang::Nld, + Lang::Uzb, + Lang::Swe, + Lang::Nob, + Lang::Ceb, + Lang::Ilo, + ]; let options = Options::new().set_blacklist(blacklist); let output = detect_with_options(text, &options); assert_eq!(output.is_some(), true); @@ -224,7 +242,9 @@ mod tests { let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap(); assert!(!info.is_reliable()); - let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap(); + let info = + detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm") + .unwrap(); assert!(!info.is_reliable()); // 1000 chars of randomly generated Cyrillic text diff --git a/src/detector.rs b/src/detector.rs index 0102c3d..2d4bb20 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -1,9 +1,9 @@ -use lang::Lang; -use script::Script; -use script::detect_script; +use detect; use info::Info; +use lang::Lang; use options::Options; -use detect; +use script::detect_script; +use script::Script; /// Configurable structure that holds detection options and provides functions /// to detect language and script. @@ -72,7 +72,10 @@ mod tests { #[test] fn test_detect_script() { // Russian, Cyrillic - assert_eq!(Detector::new().detect_script("Кириллица"), Some(Script::Cyrillic)); + assert_eq!( + Detector::new().detect_script("Кириллица"), + Some(Script::Cyrillic) + ); } #[test] diff --git a/src/info.rs b/src/info.rs index 808ee61..73837f6 100644 --- a/src/info.rs +++ b/src/info.rs @@ -8,7 +8,7 @@ const RELIABLE_CONFIDENCE_THRESHOLD: f64 = 0.8; pub struct Info { pub(crate) lang: Lang, pub(crate) script: Script, - pub(crate) confidence: f64 + pub(crate) confidence: f64, } impl Info { diff --git a/src/lib.rs b/src/lib.rs index 63b9ea0..f640329 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,21 +32,21 @@ //! assert_eq!(lang, Some(Lang::Eng)); extern crate hashbrown; -mod lang; -mod script; -mod info; -mod utils; -mod trigrams; +mod constants; mod detect; mod detector; +mod info; +mod lang; mod options; -mod constants; +mod script; +mod trigrams; +mod utils; -pub use lang::Lang; -pub use script::Script; -pub use info::Info; pub use detector::Detector; +pub use info::Info; +pub use lang::Lang; pub use options::Options; +pub use script::Script; pub use detect::detect; pub use detect::detect_lang; diff --git a/src/options.rs b/src/options.rs index 38aff8b..f83a7a1 100644 --- a/src/options.rs +++ b/src/options.rs @@ -3,13 +3,13 @@ use lang::Lang; #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) enum List { White(Vec), - Black(Vec) + Black(Vec), } /// Allows to customize behaviour of [Detector](struct.Detector.html). #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct Options { - pub(crate) list: Option + pub(crate) list: Option, } impl Options { diff --git a/src/script.rs b/src/script.rs index a8d266c..f724a6f 100644 --- a/src/script.rs +++ b/src/script.rs @@ -1,5 +1,5 @@ -use utils::is_stop_char; use std::fmt; +use utils::is_stop_char; /// Represents a writing system (Latin, Cyrillic, Arabic, etc). #[derive(PartialEq, Eq, Debug, Clone, Copy)] @@ -34,30 +34,30 @@ pub enum Script { impl Script { pub fn name(&self) -> &str { match *self { - Script::Latin => "Latin", - Script::Cyrillic => "Cyrillic", - Script::Arabic => "Arabic", + Script::Latin => "Latin", + Script::Cyrillic => "Cyrillic", + Script::Arabic => "Arabic", Script::Devanagari => "Devanagari", - Script::Hiragana => "Hiragana", - Script::Katakana => "Katakana", - Script::Ethiopic => "Ethiopic", - Script::Hebrew => "Hebrew", - Script::Bengali => "Bengali", - Script::Georgian => "Georgian", - Script::Mandarin => "Mandarin", - Script::Hangul => "Hangul", - Script::Greek => "Greek", - Script::Kannada => "Kannada", - Script::Tamil => "Tamil", - Script::Thai => "Thai", - Script::Gujarati => "Gujarati", - Script::Gurmukhi => "Gurmukhi", - Script::Telugu => "Telugu", - Script::Malayalam => "Malayalam", - Script::Oriya => "Oriya", - Script::Myanmar => "Myanmar", - Script::Sinhala => "Sinhala", - Script::Khmer => "Khmer" + Script::Hiragana => "Hiragana", + Script::Katakana => "Katakana", + Script::Ethiopic => "Ethiopic", + Script::Hebrew => "Hebrew", + Script::Bengali => "Bengali", + Script::Georgian => "Georgian", + Script::Mandarin => "Mandarin", + Script::Hangul => "Hangul", + Script::Greek => "Greek", + Script::Kannada => "Kannada", + Script::Tamil => "Tamil", + Script::Thai => "Thai", + Script::Gujarati => "Gujarati", + Script::Gurmukhi => "Gurmukhi", + Script::Telugu => "Telugu", + Script::Malayalam => "Malayalam", + Script::Oriya => "Oriya", + Script::Myanmar => "Myanmar", + Script::Sinhala => "Sinhala", + Script::Khmer => "Khmer", } } } @@ -80,36 +80,38 @@ type ScriptCounter = (Script, fn(char) -> bool, usize); /// ``` pub fn detect_script(text: &str) -> Option