-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTRGrammar.py
95 lines (76 loc) · 3.3 KB
/
TRGrammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
import unittest
from typing import List, Dict, Set
from unicode_tr import unicode_tr
from src.Core.Languages.Grammars.IGrammar import IGrammar
from src.Tools import StringHelper
class TRGrammar(IGrammar):
    """Grammar helper for Turkish: casing, alphabet, vowels and accent reduction.

    Case conversion is delegated to ``unicode_tr`` rather than ``str`` so that
    the Turkish dotted/dotless I pairs are converted by that library's rules.
    Accent handling is driven by a replaceable mapping of accented character
    to plain character (see ``SetAccents``).
    """

    def __init__(self) -> None:
        super().__init__()
        # Lowercase vowels, including the circumflexed forms handled as accents.
        self._Vowels = {'a','e','i','ı','u','ü','o','ö','â','î','û','ê'}
        self._AccentMappings: Dict[str, str] = {}
        self._AccentChars: Set[str] = set()
        # Uppercase support should be added. There may be different accent marks.
        # Lowercased ones were enough for OSimUnr study.
        self.SetAccents({"â":"a", "î":"i", "û":"u", "ê":"e"})

    def ToLowerCase(self, input: str) -> str:
        """Return ``input`` lowercased through ``unicode_tr``."""
        return unicode_tr(input).lower()

    def ToUpperCase(self, input: str) -> str:
        """Return ``input`` uppercased through ``unicode_tr``."""
        return unicode_tr(input).upper()

    def GetAlphabet(self) -> List[str]:
        """Return the 29 uppercase letters of the Turkish alphabet, in order.

        A new list is built on every call, so callers may modify the result.
        """
        return ["A","B","C","Ç","D","E","F","G","Ğ","H","I","İ","J","K","L","M","N","O","Ö","P","R","S","Ş","T","U","Ü","V","Y","Z"]

    def IsVowel(self, char: str) -> bool:
        """
        Returns whether it is a vowel.
        The value is lowercased with ``ToLowerCase`` and then looked up as a
        whole in the vowel set.
        :param char: the character to test
        :return: True when the lowercased value is one of the known vowels
        """
        return self.ToLowerCase(char) in self._Vowels

    #region Accents
    def HasAccent(self, word: str) -> bool:
        """
        Returns whether the word contains any of the registered accent characters.
        :param word: the text to inspect
        :return: False when ``StringHelper.IsNullOrEmpty(word)`` is true or no
                 registered accent character occurs in the word
        """
        if StringHelper.IsNullOrEmpty(word):
            return False
        return any(accent in word for accent in self._AccentChars)

    def SetAccents(self, accentMappings: Dict[str, str]) -> None:
        """
        Replaces the accent table used by ``HasAccent`` and ``ReduceAccents``.
        :param accentMappings: accented text -> replacement text
        """
        # Keep a private copy so later changes to the caller's dict cannot
        # leave the mapping and the accent-character set out of step.
        self._AccentMappings = dict(accentMappings)
        # Rebuild instead of only adding: characters from a previous mapping
        # must not linger, otherwise HasAccent would report accents that
        # ReduceAccents can no longer replace. The set is updated in place so
        # references handed out by GetAccentChars keep seeing current data.
        self._AccentChars.clear()
        self._AccentChars.update(self._AccentMappings)

    def GetAccentChars(self) -> Set[str]:
        """Return the set of registered accent characters (the internal set itself, not a copy)."""
        return self._AccentChars

    def ReduceAccents(self, word: str) -> str:
        """
        Converts the accented letters of the given word to their Latin equivalents.
        Only supports lowercased ones.
        :param word: the text to convert
        :return: the word with every registered accent replaced; the input is
                 returned unchanged when ``HasAccent`` reports no accent
        """
        if not self.HasAccent(word):
            return word
        reduced: str = word
        # Not the most optimal method: every replace() produces a new string.
        for accented, plain in self._AccentMappings.items():
            reduced = reduced.replace(accented, plain)
        return reduced
    #endregion
#UNITTEST
class TestTRGrammar(unittest.TestCase):
    """Unit tests for TRGrammar: Turkish I-letter casing and accent handling."""

    def setUp(self):
        # Each test method works on its own freshly constructed grammar.
        self.grammar = TRGrammar()

    def test_CapitalILetter_ToLowerCase(self):
        to_lower = self.grammar.ToLowerCase
        self.assertEqual("istanbul", to_lower("İSTANBUL"))
        self.assertEqual("ısparta", to_lower("ISPARTA"))

    def test_LowerILetter_ToUpperCase(self):
        to_upper = self.grammar.ToUpperCase
        self.assertEqual("İSTANBUL", to_upper("istanbul"))
        self.assertEqual("ISPARTA", to_upper("ısparta"))

    #region Accents
    def test_GetAccents_GetDefaults(self):
        default_accents = self.grammar.GetAccentChars()
        self.assertEqual(4, len(default_accents))

    def test_HasAccents_Accent_True(self):
        found = self.grammar.HasAccent("günahkârlık")
        self.assertTrue(found)

    def test_HasAccents_Accent_False(self):
        found = self.grammar.HasAccent("günahkarlık")
        self.assertFalse(found)

    def test_ReduceAccents_AccentedWord_ReduceToLatin(self):
        reduced = self.grammar.ReduceAccents("günahkârlık")
        self.assertEqual("günahkarlık", reduced)

    def test_ReduceAccents_NonAccentedWord_DoNothing(self):
        reduced = self.grammar.ReduceAccents("günahkarlık")
        self.assertEqual("günahkarlık", reduced)
    #endregion
# Run the unit tests defined in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()