diff --git a/Andaluh.sln b/Andaluh.sln index 49aef28..b3a551f 100644 --- a/Andaluh.sln +++ b/Andaluh.sln @@ -15,21 +15,77 @@ EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU + Debug|ARM = Debug|ARM + Debug|ARM64 = Debug|ARM64 + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 Release|Any CPU = Release|Any CPU + Release|ARM = Release|ARM + Release|ARM64 = Release|ARM64 + Release|x64 = Release|x64 + Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|ARM.ActiveCfg = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|ARM.Build.0 = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|ARM64.ActiveCfg = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|ARM64.Build.0 = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|x64.ActiveCfg = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|x64.Build.0 = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|x86.ActiveCfg = Debug|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Debug|x86.Build.0 = Debug|Any CPU {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|Any CPU.ActiveCfg = Release|Any CPU {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|Any CPU.Build.0 = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|ARM.ActiveCfg = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|ARM.Build.0 = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|ARM64.ActiveCfg = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|ARM64.Build.0 = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|x64.ActiveCfg = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|x64.Build.0 = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|x86.ActiveCfg = Release|Any CPU + {25F19C8A-6301-41E7-B127-90387F73ABF6}.Release|x86.Build.0 = Release|Any CPU {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|ARM.ActiveCfg = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|ARM.Build.0 = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|ARM64.ActiveCfg = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|ARM64.Build.0 = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|x64.ActiveCfg = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|x64.Build.0 = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|x86.ActiveCfg = Debug|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Debug|x86.Build.0 = Debug|Any CPU {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|Any CPU.ActiveCfg = Release|Any CPU {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|Any CPU.Build.0 = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|ARM.ActiveCfg = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|ARM.Build.0 = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|ARM64.ActiveCfg = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|ARM64.Build.0 = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|x64.ActiveCfg = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|x64.Build.0 = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|x86.ActiveCfg = Release|Any CPU + {E0097AB9-0B72-4D41-81BB-53D64A282CB1}.Release|x86.Build.0 = Release|Any CPU {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|ARM.ActiveCfg = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|ARM.Build.0 = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|ARM64.ActiveCfg = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|ARM64.Build.0 = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|x64.ActiveCfg = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|x64.Build.0 = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|x86.ActiveCfg = Debug|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Debug|x86.Build.0 = Debug|Any CPU {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|Any CPU.ActiveCfg = Release|Any CPU {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|Any CPU.Build.0 = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|ARM.ActiveCfg = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|ARM.Build.0 = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|ARM64.ActiveCfg = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|ARM64.Build.0 = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|x64.ActiveCfg = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|x64.Build.0 = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|x86.ActiveCfg = Release|Any CPU + {3A542431-3F5D-4F90-9151-58B13FE5BBFA}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Andaluh/Andaluh.csproj b/Andaluh/Andaluh.csproj index 3072221..81fb875 100644 --- a/Andaluh/Andaluh.csproj +++ b/Andaluh/Andaluh.csproj @@ -1,10 +1,24 @@  - netcoreapp3.1 + netstandard2.0 8.0 Library - + true + false + GPL-3.0-or-later + Chan (aburrio@outlook.com) | AndaluGeeks + AndaluGeeks + Use this extension to transcript any spanish text to Andaluh + Copyright (C) AndaluGeeks 2020 + https://andaluh.es/ + https://github.com/andalugeeks/andaluh-net + Transliteration, Transcription, Andaluz, Andalu, Andalûh, Andalú + Transcriptor Andaluh + 1.0.3 + 1.0.3.0 + 1.0.3.0 + Downgraded framework version to increase compatibility @@ -13,4 +27,9 @@ + + + + + diff --git a/Andaluh/EPA.cs b/Andaluh/EPA.cs index 73d1600..aa924e2 100644 --- a/Andaluh/EPA.cs +++ b/Andaluh/EPA.cs @@ -5,7 +5,7 @@ namespace Andaluh { public static class EPA { - public static string Transcribe(this string text, string vaf = "VAF", string vvf = "VVF") => + public static string Transcribe(this string text) => text.IsNullOrEmpty() ? string.Empty : new EPAEngine().Transcribe(text); public static string ToAndaluh(this string text) => text.Transcribe(); diff --git a/Andaluh/EPAEngine.cs b/Andaluh/EPAEngine.cs index 4418c8b..3997daa 100644 --- a/Andaluh/EPAEngine.cs +++ b/Andaluh/EPAEngine.cs @@ -31,8 +31,6 @@ internal class EPAEngine new WordInteractionRules() }; - - public string Transcribe(string text) { var tokenizedString = new TokenEvaluator(text); diff --git a/Andaluh/Extensions/CharExtensions.cs b/Andaluh/Extensions/CharExtensions.cs index a2a8bb5..038bc1d 100644 --- a/Andaluh/Extensions/CharExtensions.cs +++ b/Andaluh/Extensions/CharExtensions.cs @@ -11,7 +11,7 @@ public static char GetVowelTilde(this char vowel) // If no tilde, replace with circumflex if (i != -1) return Constants.VOWELS_ALL_TILDE[i]; - if (Constants.VOWELS_ALL_TILDE.Contains(vowel)) return vowel; + if (Constants.VOWELS_ALL_TILDE.Contains(vowel.ToString())) return vowel; return '#'; } @@ -23,7 +23,7 @@ public static char GetVowelCircumflex(this char vowel) if (i != -1) return Constants.VOWELS_ALL_NOTILDE[i + 5]; - if (Constants.VOWELS_ALL_TILDE.Contains(vowel)) return vowel; + if (Constants.VOWELS_ALL_TILDE.Contains(vowel.ToString())) return vowel; return '#'; } diff --git a/Andaluh/Extensions/MatchCollectionExtensions.cs b/Andaluh/Extensions/MatchCollectionExtensions.cs new file mode 100644 index 0000000..ee94ae2 --- /dev/null +++ b/Andaluh/Extensions/MatchCollectionExtensions.cs @@ -0,0 +1,18 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Text.RegularExpressions; + +namespace Andaluh.Extensions +{ + public static class MatchCollectionExtensions + { + public static bool Any(this MatchCollection matches) => matches.Count != 0; + public static IEnumerable Where(this MatchCollection matches, Func func) + { + foreach (Match match in matches) + if (func(match)) yield return match; + } + } +} diff --git a/Andaluh/Extensions/StringExtensions.cs b/Andaluh/Extensions/StringExtensions.cs index 6b130d3..611ed4f 100644 --- a/Andaluh/Extensions/StringExtensions.cs +++ b/Andaluh/Extensions/StringExtensions.cs @@ -38,20 +38,10 @@ public static string ReplaceFirst(this string text, Match match, string replace, public static string GetWholeWord(this string text, int index) { - int startIndex, endIndex; - startIndex = text.GetWordStartIndex(index); - endIndex = text.GetWordEndIndex(index); - return text.Substring(startIndex, endIndex - startIndex); - } - - public static int GetWordEndIndex(this string text, int index) - { - if (index >= text.Length) return text.Length; + var startIndex = text.GetWordStartIndex(index); + var endIndex = text.GetWordEndIndex(index); - for (int i = index; i < text.Length; i++) - if (Constants.CARACTERES_NO_PALABRA.Any(c => c == text[i])) return i; - - return text.Length; + return text.Substring(startIndex, endIndex - startIndex); } public static int GetWordStartIndex(this string text, int index) @@ -63,18 +53,32 @@ public static int GetWordStartIndex(this string text, int index) return 0; } + + public static int GetWordEndIndex(this string text, int index) + { + if (index >= text.Length) return text.Length; + + for (int i = index; i < text.Length; i++) + if (Constants.CARACTERES_NO_PALABRA.Any(c => c == text[i])) return i; + + return text.Length; + } + public static string GetPrefix(this string text, Match match, int bias) { - var palabra = text.GetWholeWord(match.Index + bias); + var matchIndex = match.Index + bias; + var startIndex = text.GetWordStartIndex(matchIndex); - return palabra.Substring(0, palabra.IndexOf(match.Value)); + return text.Substring(startIndex, matchIndex - startIndex); } public static string GetSuffix(this string text, Match match, int bias) { - var palabra = text.GetWholeWord(match.Index + bias); + var matchIndex = match.Index + bias; + + var endIndex = text.GetWordEndIndex(matchIndex); - return palabra.Substring(palabra.IndexOf(match.Value) + match.Value.Length); + return text.Substring(matchIndex + match.Value.Length, endIndex - matchIndex - match.Value.Length); } public static string KeepCase(this string word, string replacement_word) @@ -104,5 +108,14 @@ public static string ReplaceFirstKeepingCase(this string text, string search, st } public static bool IsNullOrEmpty(this string str) => str == null || str.Trim().Length == 0; + + public static string GetRange(this string text, int start, int end) => + text.Substring(start, end - start); + + public static string GetRangeMinusRight(this string text, int start, int minusEnd) => + text.Substring(start, text.Length - minusEnd); + + public static char GetCharMinusRight(this string text, int minusEnd) => + text[text.Length - minusEnd]; } } \ No newline at end of file diff --git a/Andaluh/Rules/Base/Rule.cs b/Andaluh/Rules/Base/Rule.cs index 87447a8..fedfbdf 100644 --- a/Andaluh/Rules/Base/Rule.cs +++ b/Andaluh/Rules/Base/Rule.cs @@ -30,6 +30,7 @@ public string Execute(Dictionary dynamicRuleExceptions, string t return ReplaceMany(text); } + private string ReplaceMany(string text) { var matches = Pattern?.Matches(text); @@ -37,7 +38,7 @@ private string ReplaceMany(string text) var bias = 0; - foreach (Match match in matches.Where(x=>x.Success)) + foreach (Match match in matches.Where(x => x.Success)) { if (NotException(match, text, bias)) { @@ -52,7 +53,7 @@ private string ReplaceMany(string text) private bool NotException(Match match, string text, int bias) => !IsException(text.GetWholeWord(match.Index + bias)); - private bool IsException(string palabra) => + private bool IsException(string palabra) => Exceptions?.ContainsKey(palabra.ToLower()) == true || DynamicRuleExceptions?.ContainsKey(palabra.ToLower()) == true; diff --git a/Andaluh/Rules/Base/RuleBundle.cs b/Andaluh/Rules/Base/RuleBundle.cs index 565d929..ff2b004 100644 --- a/Andaluh/Rules/Base/RuleBundle.cs +++ b/Andaluh/Rules/Base/RuleBundle.cs @@ -5,6 +5,7 @@ namespace Andaluh.Rules.Base internal abstract class RuleBundle { protected readonly Dictionary DynamicRuleExceptions; + protected Dictionary DelayedAfterRuleDynamicRuleExceptions; protected abstract IEnumerable Rules { get; } public RuleBundle(Dictionary dynamicRuleExceptions = null) @@ -14,9 +15,24 @@ public RuleBundle(Dictionary dynamicRuleExceptions = null) public string Execute(string text) { foreach (var rule in Rules) + { + DelayedAfterRuleDynamicRuleExceptions = new Dictionary(); text = rule.Execute(DynamicRuleExceptions, text); + UpdateDynamicRulesAfterCurrentRule(); + } return text; } + + private void UpdateDynamicRulesAfterCurrentRule() + { + foreach (var exception in DelayedAfterRuleDynamicRuleExceptions) + { + if (!DynamicRuleExceptions.ContainsKey(exception.Key)) + DynamicRuleExceptions.Add(exception.Key, exception.Value); + else DynamicRuleExceptions[exception.Key] = exception.Value; + } + } + } } diff --git a/Andaluh/Rules/Base/RuleConstants.cs b/Andaluh/Rules/Base/RuleConstants.cs new file mode 100644 index 0000000..276e993 --- /dev/null +++ b/Andaluh/Rules/Base/RuleConstants.cs @@ -0,0 +1,9 @@ +using System.Text.RegularExpressions; + +namespace Andaluh.Rules.Base +{ + public static class RuleConstants + { + public static readonly Regex pattern_begin_lh = new Regex(@"(?i)\b[aáeéiíoóuú](lh)[aáeéiíoóuú]"); + } +} diff --git a/Andaluh/Rules/DigraphRules.cs b/Andaluh/Rules/DigraphRules.cs index 1c4ec57..b164ca6 100644 --- a/Andaluh/Rules/DigraphRules.cs +++ b/Andaluh/Rules/DigraphRules.cs @@ -11,17 +11,28 @@ internal class DigraphRules : RuleBundle private static readonly Regex pattern_digraph_special_2 = new Regex("(?i)(tr|p)([ao])(?:ns|st)([bcçdfghjklmnpqstvwxyz])"); private static readonly Regex pattern_digraph_special_3 = new Regex("(?i)([aeiouáéíóú])([bdnr])(s)([bcçdfghjklmnpqstvwxyz])"); private static readonly Regex pattern_digraph_special_4 = new Regex("(?i)([aeiouáéíóú])[djrstxz](l)"); - private static readonly Regex pattern_digraph_general = new Regex("(?i)([aeiouáéíóú])(" + string.Join("|", Constants.DIGRAPHS) + ")"); + private static readonly Regex pattern_digraph_general = new Regex(@"(?i)([aeiouáéíóú])(" + string.Join("|", Constants.DIGRAPHS) + ")"); + private readonly Dictionary Digraph_RULES_EXCEPT = new Dictionary(); protected override IEnumerable Rules => new[] { new Rule(pattern_digraph_special_1, digraph_special1_rules_replacer), new Rule(pattern_digraph_special_2, digraph_special2_rules_replacer), new Rule(pattern_digraph_special_3, digraph_special3_rules_replacer), - new Rule(pattern_digraph_special_4, digraph_special4_rules_replacer), - new Rule(pattern_digraph_general, digraph_general_rules_replacer) + new Rule(RuleConstants.pattern_begin_lh, exceptuar_patron), + new Rule(pattern_digraph_special_4, digraph_special4_rules_replacer, Digraph_RULES_EXCEPT), + new Rule(pattern_digraph_general, digraph_general_rules_replacer, Digraph_RULES_EXCEPT) }; + private string exceptuar_patron(Match match, string text, int bias) + { + var palabra = text.GetWholeWord(match.Index + bias); + if (!Digraph_RULES_EXCEPT.ContainsKey(palabra)) + Digraph_RULES_EXCEPT.Add(palabra, palabra); + + return match.Value; + } + private string digraph_special1_rules_replacer(Match match, string text, int bias) => match.Value[1] switch { @@ -34,7 +45,7 @@ private string digraph_special2_rules_replacer(Match match, string text, int bia { string init_char = match.Groups[1].Value; char vowel_char = match.Groups[2].Value[0]; - char cons_char = match.Groups[0].Value[^1]; + char cons_char = match.Groups[0].Value.GetCharMinusRight(1); return cons_char.ToLower() == 'l' ? init_char + vowel_char.apply_repl_rules() + cons_char + "-" + cons_char : @@ -46,7 +57,7 @@ private string digraph_special3_rules_replacer(Match match, string text, int bia var vowel_char = match.Value[0].ToString(); var cons_char = match.Value[1].ToString(); var s_char = match.Value[2]; - var digraph_char = match.Value[^1]; + var digraph_char = match.Value.GetCharMinusRight(1); return cons_char.ToLower() == "r" && s_char.ToLower() == 's' ? vowel_char + cons_char + digraph_char + digraph_char : @@ -56,7 +67,7 @@ private string digraph_special3_rules_replacer(Match match, string text, int bia private string digraph_special4_rules_replacer(Match match, string text, int bias) { var vowel_char = match.Value[0].ToString(); - var digraph_char = match.Value[^1]; + var digraph_char = match.Value.GetCharMinusRight(1); return vowel_char.apply_repl_rules() + digraph_char + "-" + digraph_char; } diff --git a/Andaluh/Rules/FinalRules.cs b/Andaluh/Rules/FinalRules.cs index 8b234cb..dde8efd 100644 --- a/Andaluh/Rules/FinalRules.cs +++ b/Andaluh/Rules/FinalRules.cs @@ -7,8 +7,9 @@ namespace Andaluh.Rules { internal class FinalRules : RuleBundle { - private static readonly Regex pattern_ador = new Regex(@"(?i)\w+(adôh|edôh|idá)"); - private static readonly Regex pattern_dura = new Regex(@"(?i)\w+(dura|dero|dera|dora)"); + private static readonly Regex pattern_ador = new Regex(@"(?i)(adôh|edôh|idá)\b"); + private static readonly Regex pattern_dura = new Regex(@"(?i)(\w)(dura|durâ|duro|dero|durô|derô|dera|dora|derâ|dorâ)\b"); + private static readonly Regex pattern_deder = new Regex(@"(?i)(b|d)(eder)([aeiouáâçéíóú])\b"); private readonly Dictionary ADOR_RULES_EXCEPT = new Dictionary() { @@ -19,18 +20,30 @@ internal class FinalRules : RuleBundle protected override IEnumerable Rules => new[] { new Rule(pattern_ador, FinalesAdor, ADOR_RULES_EXCEPT), - new Rule(pattern_dura, FinalesDura, null) + new Rule(pattern_dura, FinalesDura, null), + new Rule(pattern_deder, FinalesDeder, null) }; - private string FinalesAdor(Match match, string text, int bias)=> - match.Groups[0].Value[0..^match.Groups[1].Length] + match.Groups[1].Value[0] + match.Groups[1].Value[2..]; + private string FinalesAdor(Match match, string text, int bias) => + match.Groups[0].Value.GetRangeMinusRight(0, match.Groups[1].Length) + match.Groups[1].Value[0] + match.Groups[1].Value.Substring(2); private string FinalesDura(Match match, string text, int bias) { - var prefijo = match.Groups[0].Value[0..^4]; - var vocalAcentuada = match.Groups[1].Value[1].KeepCase(match.Groups[1].Value[1].GetVowelTilde()); - var final = match.Groups[1].Value[2..4]; + var charBefore = match.Groups[1].Value; + if (charBefore == "n" || charBefore == "r") return match.Groups[0].Value; - return prefijo + vocalAcentuada + final; + var prefijo = match.Groups[0].Value.GetCharMinusRight(3); + var vocalAcentuada = prefijo.KeepCase(prefijo.GetVowelTilde()); + var final = match.Groups[0].Value.GetRange(3, 5); + + return charBefore + vocalAcentuada + final; + } + + private string FinalesDeder(Match match, string text, int bias) + { + var prefijo = match.Groups[0].Value.Substring(0, match.Groups[0].Value.IndexOf(match.Groups[1].Value)); + var reemplazo = match.Groups[2].Value.KeepCase("eér"); + + return prefijo + match.Groups[1].Value + reemplazo + match.Groups[3].Value; } public FinalRules() : base() diff --git a/Andaluh/Rules/GJRules.cs b/Andaluh/Rules/GJRules.cs index bf3469e..7f4ae4b 100644 --- a/Andaluh/Rules/GJRules.cs +++ b/Andaluh/Rules/GJRules.cs @@ -7,6 +7,8 @@ namespace Andaluh.Rules { internal class GJRules : RuleBundle { + private static readonly Regex pattern_lge = new Regex("(?i)(lge|lgé|lgi|lgí)"); + private static readonly Regex pattern_lj = new Regex("(?i)(lj)"); private static readonly Regex pattern_gj = new Regex("(?i)(g(?=[eiéí])|j)([aeiouáéíóú])"); private static readonly Regex pattern_gue_gui = new Regex("(?i)(g)u([eiéí])"); private static readonly Regex pattern_guue_guui = new Regex("(?i)(g)(ü)([eiéí])"); @@ -22,6 +24,8 @@ internal class GJRules : RuleBundle protected override IEnumerable Rules => new[] { + new Rule(pattern_lge, lge_rules_replacer), + new Rule(pattern_lj, lj_rules_replacer), new Rule(pattern_gj, gj_rules_replacer, GJ_RULES_EXCEPT), new Rule(pattern_gue_gui, gue_gui_rules_replacer), //DynamicRuleExceptions new Rule(pattern_guue_guui, guue_guui_rules_replacer), @@ -29,12 +33,14 @@ internal class GJRules : RuleBundle new Rule(pattern_guel_gues, guel_gues_rules_replacer) }; + private string lge_rules_replacer(Match match, string text, int bias) + => match.Value[0].KeepCase('r') + match.Value[1].KeepCase('h') + match.Value[2]; - private string gj_rules_replacer(Match match, string text, int bias) - { - string x_correct_capitalization = match.Value[0].IsUpperCase() ? Constants.VVF_mayus : Constants.VVF; - return x_correct_capitalization + match.Value[1]; - } + private string lj_rules_replacer(Match match, string text, int bias) + => match.Value[0].KeepCase('r') + match.Value[1].KeepCase('h'); + + private string gj_rules_replacer(Match match, string text, int bias) => + match.Value[0].KeepCase(Constants.VVF[0]) + match.Value[1]; private string gue_gui_rules_replacer(Match match, string text, int bias) => match.Value[0].ToString() + match.Value[2].ToString(); diff --git a/Andaluh/Rules/HRules.cs b/Andaluh/Rules/HRules.cs index 86c5010..ab56828 100644 --- a/Andaluh/Rules/HRules.cs +++ b/Andaluh/Rules/HRules.cs @@ -8,11 +8,10 @@ namespace Andaluh.Rules { internal class HRules : RuleBundle { - private static readonly Regex pattern_aha = new Regex("(?i)(aha|aho)"); - //private static readonly Regex pattern_aha = new Regex("(?i)([aá])(h)([aá])"); + private static readonly Regex pattern_aha = new Regex("(?i)([aá]h[aáeéíuú]|aho(?!rr|ra|ri)|ehe|ehi(?!sto)|oho|ih[ií]|uhu)"); private static readonly Regex pattern_h_general = new Regex("(?i)(? H_RULES_EXCEPT = new Dictionary() { @@ -27,13 +26,14 @@ internal class HRules : RuleBundle protected override IEnumerable Rules => new[] { + new Rule(RuleConstants.pattern_begin_lh, exceptuar_patron), new Rule(pattern_h_hua, h_hua_rules_replacer), new Rule(pattern_h_hue, h_hue_rules_replacer), - new Rule(pattern_aha, exceptuar_aha), + new Rule(pattern_aha, exceptuar_patron), new Rule(pattern_h_general, h_rules_replacer, H_RULES_EXCEPT) }; - private string exceptuar_aha(Match match, string text, int bias) + private string exceptuar_patron(Match match, string text, int bias) { var palabra = text.GetWholeWord(match.Index + bias); if (!H_RULES_EXCEPT.ContainsKey(palabra)) @@ -44,7 +44,7 @@ private string exceptuar_aha(Match match, string text, int bias) private string h_hue_rules_replacer(Match match, string text, int bias) { - string g_correct_capitalization = match.Value[0].IsUpperCase() ? "G" : "g"; + string g_correct_capitalization = match.Value[0].KeepCase('g'); var result = g_correct_capitalization + match.Value.Substring(1); AddTransliteratedWordAsExceptionForGueGui(match, text, bias, result); @@ -56,8 +56,8 @@ private void AddTransliteratedWordAsExceptionForGueGui(Match match, string text, { var palabra = text.GetWholeWord(match.Index + bias); var newWord = palabra.Replace(match.Value, result).ToLower(); - if (!DynamicRuleExceptions.ContainsKey(newWord)) - DynamicRuleExceptions.Add(newWord, newWord); + if (!DelayedAfterRuleDynamicRuleExceptions.ContainsKey(newWord)) + DelayedAfterRuleDynamicRuleExceptions.Add(newWord, newWord); } private string h_hua_rules_replacer(Match match, string text, int bias) diff --git a/Andaluh/Rules/LRules.cs b/Andaluh/Rules/LRules.cs index f87a8c6..f3ff0d5 100644 --- a/Andaluh/Rules/LRules.cs +++ b/Andaluh/Rules/LRules.cs @@ -9,15 +9,26 @@ internal class LRules : RuleBundle { private static readonly Regex pattern_l = new Regex("(?i)(l)([bcçgsdfghkmpqrtxz])"); + private readonly Dictionary L_RULES_EXCEPT = new Dictionary(); + protected override IEnumerable Rules => new[] { - new Rule(pattern_l, l_rules_replacer) + new Rule(RuleConstants.pattern_begin_lh, exceptuar_patron), + new Rule(pattern_l, l_rules_replacer, L_RULES_EXCEPT) }; + private string exceptuar_patron(Match match, string text, int bias) + { + var palabra = text.GetWholeWord(match.Index + bias); + if (!L_RULES_EXCEPT.ContainsKey(palabra)) + L_RULES_EXCEPT.Add(palabra, palabra); + + return match.Value; + } + private string l_rules_replacer(Match match, string text, int bias) => match.Value[0].KeepCase('r') + match.Value[1]; - public LRules() : base() { } } diff --git a/Andaluh/Rules/WordEndingRules.cs b/Andaluh/Rules/WordEndingRules.cs index 24d7dd3..41b556a 100644 --- a/Andaluh/Rules/WordEndingRules.cs +++ b/Andaluh/Rules/WordEndingRules.cs @@ -8,12 +8,13 @@ namespace Andaluh.Rules { internal class WordEndingRules : RuleBundle { - private static readonly Regex pattern_intervowel_d_end = new Regex(@"(?i)([aiíÍ])(d)([oa])(s?)\b"); + private static readonly Regex pattern_intervowel_d_end_exceptions = new Regex(@"(?i)[áéíóú][^aeiouáéíóú]\b"); + private static readonly Regex pattern_intervowel_d_end = new Regex(@"(?i)([aií])(d)([oa])(s?)\b"); private static readonly Regex pattern_eps_end = new Regex("(?i)(e)(ps)"); private static readonly Regex pattern_d_end = new Regex(@"(?i)([aeiouáéíóú])(d)\b"); private static readonly Regex pattern_s_end = new Regex(@"(?i)([aeiouáéíóú])(s)\b"); private static readonly Regex pattern_const_end = new Regex(@"(?i)([aeiouáâçéíóú])([bcfgjkprtxz]\b)"); - private static readonly Regex pattern_l_end = new Regex(@"(?i)([aeiouáâçéíóú])(l\b)"); + private static readonly Regex pattern_l_end = new Regex(@"(?i)([aeiouáâçéíóú])l\b"); private static readonly Regex pattern_vocal_tilde = new Regex("(?i)á|é|í|ó|ú"); private static readonly Dictionary WORDEND_D_INTERVOWEL_RULES_EXCEPT = new Dictionary() @@ -89,7 +90,6 @@ internal class WordEndingRules : RuleBundle private static readonly Dictionary WORDEND_CONST_RULES_EXCEPT = new Dictionary() { {"al", "al"}, - {"cual", "cuâ"}, {"del", "del"}, {"dél", "dél"}, {"el", "el"}, @@ -114,8 +114,8 @@ internal class WordEndingRules : RuleBundle new Rule(pattern_eps_end, eps_end_rules_replacer), new Rule(pattern_d_end, d_end_rules_replacer, WORDEND_D_RULES_EXCEPT), new Rule(pattern_s_end, s_end_rules_replacer, WORDEND_S_RULES_EXCEPT), - new Rule(pattern_const_end, const_end_rules_replacer, WORDEND_CONST_RULES_EXCEPT), - new Rule(pattern_l_end, const_end_rules_replacer, WORDEND_CONST_RULES_EXCEPT) + new Rule(pattern_l_end, const_end_rules_replacer, WORDEND_CONST_RULES_EXCEPT), + new Rule(pattern_const_end, const_end_rules_replacer, WORDEND_CONST_RULES_EXCEPT) }; private bool contain_vocal_tilde(string text) => pattern_vocal_tilde.Match(text).Success; @@ -123,10 +123,15 @@ internal class WordEndingRules : RuleBundle private string intervowel_d_end_rules_replacer(Match match, string text, int bias) { + var prefix = text.GetPrefix(match, bias); + + if (pattern_intervowel_d_end_exceptions.IsMatch(prefix)) return match.Value; + var firstVowel = match.Value[0]; var lastVowel = match.Value[2]; - if (contain_vocal_tilde(firstVowel)) return match.Value; + + if (contain_vocal_tilde(prefix)) return match.Value; switch (match.Value) { @@ -190,7 +195,7 @@ private string const_end_rules_replacer(Match match, string text, int bias) if (contain_vocal_tilde(prefix)) return suffixFirstChar.apply_repl_rules(); - return contain_vocal_tilde(suffixFirstChar) ? + return suffixFirstChar != 'í' && suffixFirstChar != 'ú' && contain_vocal_tilde(suffixFirstChar) ? suffixFirstChar.apply_repl_rules() : suffixFirstChar.apply_repl_rules() + suffixFirstChar.KeepCase('h'); } diff --git a/Andaluh/Rules/WordInteractionRules.cs b/Andaluh/Rules/WordInteractionRules.cs index f8dd3f2..c53b93e 100644 --- a/Andaluh/Rules/WordInteractionRules.cs +++ b/Andaluh/Rules/WordInteractionRules.cs @@ -16,8 +16,8 @@ internal class WordInteractionRules : RuleBundle private static string word_interaction_rules_replacer(Match match, string text, int bias)=> match.Value[0].ToLower() == 'd' ? - match.Value[0..2] + (match.Value[2].IsUpperCase() ? "R" : "r") + match.Value[3..] : - match.Value[0] + (match.Value[1].IsUpperCase() ? "R" : "r") + match.Value[2..]; + match.Value.GetRange(0, 2) + (match.Value[2].IsUpperCase() ? "R" : "r") + match.Value.Substring(3) : + match.Value[0] + (match.Value[1].IsUpperCase() ? "R" : "r") + match.Value.Substring(2); public WordInteractionRules() : base() { } diff --git a/Andaluh/SentenceMethods/SentenceExceptionConstants.cs b/Andaluh/SentenceMethods/SentenceExceptionConstants.cs index 20aebcd..c55c906 100644 --- a/Andaluh/SentenceMethods/SentenceExceptionConstants.cs +++ b/Andaluh/SentenceMethods/SentenceExceptionConstants.cs @@ -4,8 +4,14 @@ namespace Andaluh.SentenceMethods { public static class SentenceExceptions { - public static Dictionary Exceptions = new Dictionary + private static string[] TradeMarks = new string[] + { + "google", "twitter", "facebook", "outlook" + }; + + private static Dictionary Exceptions = new Dictionary { + { "et", "et" }, { "a capela","a capela"}, { "a contráriis","a contrárî"}, { "a contrario sensu","a contrario çençu"}, @@ -45,6 +51,7 @@ public static class SentenceExceptions { "ad referéndum","ârreferendum"}, { "ad tempus","âttempû"}, { "ad valórem","âbbalórem"}, + { "álter ego", "árterego" }, { "ex abrupto","ehabrûtto" }, { "ex aequo","ehaecuo" }, { "ex cáthedra","êccátedra" }, @@ -54,5 +61,31 @@ public static class SentenceExceptions { "ut supra","ut çupra" }, { "vox pópuli", "bôppópuli" } }; + + public static Dictionary allExceptions; + + public static Dictionary AllExceptions + { + get + { + if (allExceptions == null) + allExceptions = CreateAllExceptions(); + + return allExceptions; + } + } + private static Dictionary CreateAllExceptions() + { + var allExceptions = new Dictionary(); + + foreach (var exception in Exceptions) + allExceptions.Add(exception.Key, exception.Value); + + foreach (var tradeMark in TradeMarks) + allExceptions.Add(tradeMark, tradeMark); + + + return allExceptions; + } } } diff --git a/Andaluh/SentenceMethods/Token.cs b/Andaluh/SentenceMethods/Token.cs index 4c9820b..d71c239 100644 --- a/Andaluh/SentenceMethods/Token.cs +++ b/Andaluh/SentenceMethods/Token.cs @@ -1,4 +1,5 @@ -using System.Linq; +using System; +using System.Linq; using System.Text.RegularExpressions; using static Andaluh.SentenceMethods.TokenEvaluator; @@ -22,12 +23,19 @@ private Token(string value, int position, TranscriptionTypes transcription) public static Token GetEscapedToken(Match match) => new Token(match.Value, match.Index, TranscriptionTypes.Escaped); - public static Token GetExceptionToken(string exception, int position) => - new Token(exception, position, TranscriptionTypes.Exception); + public static Token GetExceptionToken(Match match) => + new Token(match.Value, match.Index, TranscriptionTypes.Exception); public static Token GetStandardToken(string str, int position) => new Token(str, position, TranscriptionTypes.Standard); public override string ToString() => $"{StartIndex}, {EndIndex}, {Value}"; + + internal void Copy(Token newToken) + { + Value = newToken.Value; + StartIndex = newToken.StartIndex; + EndIndex = newToken.EndIndex; + } } } diff --git a/Andaluh/SentenceMethods/TokenEvaluator.cs b/Andaluh/SentenceMethods/TokenEvaluator.cs index 70e0966..cdf33a1 100644 --- a/Andaluh/SentenceMethods/TokenEvaluator.cs +++ b/Andaluh/SentenceMethods/TokenEvaluator.cs @@ -1,4 +1,5 @@ -using System; +using Andaluh.Extensions; +using System; using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; @@ -7,7 +8,7 @@ namespace Andaluh.SentenceMethods { internal class TokenEvaluator { - private Regex EscapeStringsPattern = new Regex(@"(?i)(http[^ ]+)|(@\w+)|(#\w+)"); + private Regex EscapeStringsPattern = new Regex(@"(?i)(http[^ ]+)|(@\w+)|(#\w+)|(\w+@\w+)|(\w+\.es)|(\w+\.com)"); public enum TranscriptionTypes { Exception, Escaped, Standard } List Tokens; @@ -30,7 +31,7 @@ public TokenEvaluator(string text) private void ReplaceExceptions() { foreach (var token in Tokens.Where(x => x.Transcription == TranscriptionTypes.Exception)) - token.Value = SentenceExceptions.Exceptions[token.Value]; + token.Value = token.Value.KeepCase(SentenceExceptions.AllExceptions[token.Value.ToLower()]); } internal IEnumerable GetWordsToTransliterate() => @@ -44,11 +45,11 @@ private void FillGaps(string text) var gaps = new List(); - if (Tokens[0].StartIndex > 0) gaps.Add(Token.GetStandardToken(text[0..Tokens[0].StartIndex], 0)); + if (Tokens[0].StartIndex > 0) gaps.Add(Token.GetStandardToken(text.GetRange(0, Tokens[0].StartIndex), 0)); for (int i = 0; i < Tokens.Count - 1; i++) { - var gapText = text[Tokens[i].EndIndex..Tokens[i + 1].StartIndex]; + var gapText = text.GetRange(Tokens[i].EndIndex, Tokens[i + 1].StartIndex); if (gapText.Length == 0) continue; gaps.Add(Token.GetStandardToken(gapText, Tokens[i].EndIndex)); @@ -70,11 +71,11 @@ private int GetTokenBefore(Token gap) private void AddExceptions(string text) { - var exceptions = new Regex($"(?i)({string.Join('|', SentenceExceptions.Exceptions.Keys)})"); + var exceptions = new Regex(@$"(?i)(\b{string.Join("|", SentenceExceptions.AllExceptions.Keys)})\b"); var matches = exceptions.Matches(text).Where(x => x.Success); foreach (var match in matches) - Tokens.Add(Token.GetExceptionToken(text, match.Index)); + AddExceptionTokenIfNotContained(Token.GetExceptionToken(match)); } private void AddEscapedStrings(string text) @@ -84,5 +85,16 @@ private void AddEscapedStrings(string text) foreach (var match in matches) Tokens.Add(Token.GetEscapedToken(match)); } + + private void AddExceptionTokenIfNotContained(Token newToken) + { + var tokenContaining = Tokens.FirstOrDefault(x => x.StartIndex <= newToken.StartIndex && x.EndIndex >= newToken.EndIndex); + if (tokenContaining != null) return; //The token we are adding is already contained in an existing token + + var tokenContained = Tokens.FirstOrDefault(x => x.StartIndex >= newToken.StartIndex && x.EndIndex <= newToken.EndIndex); + + if (tokenContained != null) tokenContained.Copy(newToken); + else Tokens.Add(newToken); + } } } diff --git a/Tests/Acceptance.cs b/Tests/Acceptance.cs index 549bdc4..96db744 100644 --- a/Tests/Acceptance.cs +++ b/Tests/Acceptance.cs @@ -41,6 +41,6 @@ public void Test11() => [Fact] public void Test12() => - Assert.Equal("Oye y @miguel, la wh HTTPS://andaluh.es no ale en https://www.google.es pero i en http://google.com #porqueseor", EPA.Transcribe("Oye sexy @miguel, la web HTTPS://andaluh.es no sale en https://www.google.es pero si en http://google.com #porqueseor")); + Assert.Equal("Oye y @miguel, la wh HTTPS://andaluh.es no ale en google.es pero i en http://google.com #porqueseor", EPA.Transcribe("Oye sexy @miguel, la web HTTPS://andaluh.es no sale en google.es pero si en http://google.com #porqueseor")); } } diff --git a/Tests/Lemario.cs b/Tests/Lemario.cs index cfb7a4a..03af9cf 100644 --- a/Tests/Lemario.cs +++ b/Tests/Lemario.cs @@ -1,5 +1,6 @@ using Andaluh; using System; +using System.Linq; using System.Threading.Tasks; using Tests.CSVUtils; using Xunit; @@ -11,6 +12,8 @@ public class Lemario [Fact] public void LemarioCompleto() { + var aciertosLock = new object(); + var fallosLock = new object(); var aciertos = 0; var fallos = 0; var listaDeFallos = string.Empty; @@ -19,18 +22,20 @@ public void LemarioCompleto() Parallel.ForEach(todasLasPalabras, palabra => { - if (palabra.Andaluh == palabra.Castellano.ToAndaluh()) aciertos++; + if (palabra.Andaluh == palabra.Castellano.ToAndaluh()) sumaAcierto(); else { - fallos++; + sumaFallo(); listaDeFallos += $"Error: {palabra.Castellano} => {palabra.Castellano.ToAndaluh()} se esperaba {palabra.Andaluh}\r\n"; } }); - if (fallos != 0) throw new Exception($"Aciertos {aciertos} | Fallos {fallos} => {aciertos * 100/(aciertos+fallos)}%\r\nLISTA DE ERRORES\r\n{listaDeFallos}"); + if (fallos != 0) throw new Exception($"Aciertos {aciertos} | Fallos {fallos} => {aciertos * 100/ todasLasPalabras.Count() }%\r\nLISTA DE ERRORES\r\n{listaDeFallos}"); Assert.Equal(0, fallos); + void sumaAcierto() { lock (aciertosLock) aciertos++; } + void sumaFallo() { lock (fallosLock) fallos++; } } } } diff --git a/Tests/Unit.cs b/Tests/Unit.cs index 59fdd94..f84af81 100644 --- a/Tests/Unit.cs +++ b/Tests/Unit.cs @@ -21,6 +21,30 @@ public void CHCambiaX() Assert.Equal("X x Xan xan", res); } + [Fact] + public void Entendederas() + { + var res = EPA.Transcribe("absolvederas entendederas"); + + Assert.Equal("orber entender", res); + } + + [Fact] + public void Alhurreca() + { + var res = EPA.Transcribe("alhurreca"); + + Assert.Equal("alhurreca", res); + } + + [Fact] + public void Ahuehue() + { + var res = EPA.Transcribe("ahuehu"); + + Assert.Equal("aguegu", res); + } + [Fact] public void ElQueAcanala() { @@ -29,7 +53,6 @@ public void ElQueAcanala() Assert.Equal("acanalah", res); } - [Fact] public void D_Intervocalica() { @@ -62,6 +85,22 @@ public void Around() Assert.Equal("Arrededh", res); } + [Fact] + public void Acetamida() + { + var res = EPA.Transcribe("acetamida"); + + Assert.Equal("aetamida", res); + } + + [Fact] + public void Aljarafe() + { + var res = EPA.Transcribe("aljarafe algbrico"); + + Assert.Equal("arharafe arhbrico", res); + } + [Fact] public void El() { @@ -77,7 +116,14 @@ public void AFortiori() Assert.Equal("afortiori", res); } - + + [Fact] + public void Apegaderas() + { + var res = EPA.Transcribe("apegaderas"); + + Assert.Equal("apegar", res); + } [Fact] public void Del() @@ -86,8 +132,31 @@ public void Del() Assert.Equal("del ttrtto", res); } + [Fact] + public void Desposeido() + { + var res = EPA.Transcribe("desposedo"); + + Assert.Equal("dppoeo", res); + } + [Fact] + public void Dulce() + { + var res = EPA.Transcribe("almbar"); + + Assert.Equal("armb", res); + } + + [Fact] + public void Tonto() + { + var res = EPA.Transcribe("estpido"); + + Assert.Equal("ttpido", res); + } + [Fact] public void Manuel() { @@ -96,6 +165,22 @@ public void Manuel() Assert.Equal("er Manuh", res); } + [Fact] + public void Nicolas() + { + var res = EPA.Transcribe("maduro"); + + Assert.Equal("maro", res); + } + + [Fact] + public void Escuchar() + { + var res = EPA.Transcribe("or"); + + Assert.Equal("oh", res); + } + [Fact] public void Silvar() { @@ -120,6 +205,13 @@ public void Triceps() Assert.Equal("tr", res); } + [Fact] + public void Bebedero() + { + var res = EPA.Transcribe("aljibe aljuba"); + + Assert.Equal("arhibe arhuba", res); + } [Fact] @@ -156,8 +248,6 @@ public void Sexy() Assert.Equal("y", res); } - - [Fact] public void Cacahue() { @@ -166,6 +256,14 @@ public void Cacahue() Assert.Equal("cacaGuET", res); } + [Fact] + public void Cicahue() + { + var res = EPA.Transcribe("cicahuite"); + + Assert.Equal("icaguite", res); + } + [Fact] public void Spanglish() { @@ -182,6 +280,14 @@ public void Escapes() Assert.Equal("@miguel http://google.com #Hashtag", res); } + [Fact] + public void MasEscapes() + { + var res = EPA.Transcribe("Mi correo es todito@outlook.com es de Outlook. Tambin tengo cuenta en twitter"); + + Assert.Equal("Mi correo todito@outlook.com de Outlook. Tambin tengo cuenta en twitter", res); + } + [Fact] public void Casada() { @@ -191,7 +297,23 @@ public void Casada() } [Fact] - public void Ahotado() + public void Cazabombardero() + { + var res = EPA.Transcribe("cazabombardero"); + + Assert.Equal("caabombardero", res); + } + + [Fact] + public void Cual() + { + var res = EPA.Transcribe("cual"); + + Assert.Equal("cuh", res); + } + + [Fact] + public void Acar() { var res = EPA.Transcribe("ahotado"); @@ -254,7 +376,14 @@ public void Valkiria() Assert.Equal("barkiri", res); } - + + [Fact] + public void Viandero() + { + var res = EPA.Transcribe("viandero"); + + Assert.Equal("biandero", res); + } [Fact] public void Bueno() @@ -264,6 +393,14 @@ public void Bueno() Assert.Equal("Qu Gueno, qu guena", res); } + [Fact] + public void Coger() + { + var res = EPA.Transcribe("aprehender"); + + Assert.Equal("aprehendh", res); + } + [Fact] public void TodasLasCosas() {