developers-against-repressions · lucidyan · Sep 18, 2019 · Sep 19, 2019 · Sep 19, 2019 · Sep 19, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -1,3 +1,13 @@
 language: python
+python:
+  - 3.6
+
+install:
+  # Install the requirements
+  # -U asks pip to upgrade; note that requirements.txt pins exact versions,
+  # so unpin them there if CI should surface incompatibilities early.
+  - pip install -U -r requirements.txt
+  - pip list
+
 script:
   - python update_readme.py
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+fuzzyset==0.0.19
+transliterate==1.10.2
diff --git a/update_readme.py b/update_readme.py
@@ -1,13 +1,29 @@
 #!/usr/bin/env python3
 
+import hashlib
 import os
 import re
-import hashlib
+import urllib.request
+from collections import Counter
+from copy import copy
+from operator import itemgetter
+
+import fuzzyset
+import transliterate
+
+
+def strip_func(s):
+    return re.sub(' {2,}', ' ', s.strip())  # trim ends, squeeze space runs
+
+
+def has_cyrillic(text):
+    return re.search('[\u0400-\u04FF]', text) is not None  # any char in U+0400..U+04FF
 
 
 class InvalidFileFormatException(Exception):
     pass
 
+
 def load_signed():
     signed = []
     signed_new = set()
@@ -21,7 +37,7 @@ def load_signed():
             print('Skipping non-file "%s"' % filename)
             continue
 
-        with open(filename) as inp:
+        with open(filename, encoding='utf-8-sig') as inp:
             for i, line in enumerate(inp):
                 line = line.strip()
                 if not line:
@@ -38,13 +54,13 @@ def load_signed():
                 else:
                     signed_new.add((m.group(1).strip(), m.group(2).strip()))
     for signature in signed_new:
-    	signed.append(signature)
+        signed.append(signature)
     return sorted(signed, key=lambda pair: hashlib.sha256(repr(pair).encode('utf-8')).hexdigest())
 
 
 def write_signed(signed, outp):
     for i, signature in enumerate(signed):
-        outp.write('| {:<4} | {:<34} | {:<39} |\n'.format(i+1, signature[0], signature[1]))
+        outp.write('| {:<4} | {:<34} | {:<39} |\n'.format(i + 1, *signature[:2]))
 
 
 def update_readme(signed):
@@ -56,9 +72,77 @@ def update_readme(signed):
                 outp.write(line)
 
 
+def fix_names(persons, prob_treshold=0.7):
+    """
+    Reorder each signer's name into "{Last Name} {First Name}" form.
+
+    Downloads a first-name list, fuzzy-matches every word of each name
+    (transliterated with the 'ru' language pack) against it, and swaps the
+    first two words when the leading word is the best first-name match.
+
+    :param persons: iterable of (name, description) string pairs
+    :param prob_treshold: minimum fuzzy-match score for a word to count as
+        a first name
+    :return: list of (name, description) pairs, in the input order
+    """
+    vocabulary_first_names = fuzzyset.FuzzySet()
+
+    # A timeout keeps an unresponsive server from hanging the run forever.
+    with urllib.request.urlopen('http://www.searchnames.ru/allnames.csv', timeout=30) as response:
+        for line in response.readlines():
+            fields = line.decode('cp1251').replace('"', '').split(';')
+            # The name is the second column; skip blank or truncated rows.
+            if len(fields) > 1 and strip_func(fields[1]):
+                vocabulary_first_names.add(strip_func(fields[1]))
+
+    final_signed_names = []
+
+    for raw_name, description in persons:
+        name = transliterate.translit(raw_name, 'ru')
+
+        if has_cyrillic(raw_name) and name != raw_name:
+            print(
+                f'\nWarning: the name "{raw_name}" contains non-cyrillic characters\n',
+                f'Cyrillic replacement (manual) "{name}"\n'
+            )
+
+        # (score, word, index) for every word that passes the threshold.
+        words_probabilities = []
+        for word_index, word in enumerate(name.split()):
+            # FuzzySet.get() returns None when nothing matches at all.
+            vocabulary_result = vocabulary_first_names.get(word)
+            if not vocabulary_result:
+                continue
+            current_prob = max(match[0] for match in vocabulary_result)
+            if current_prob >= prob_treshold:
+                words_probabilities.append((current_prob, word, word_index))
+
+        final_name = raw_name
+        if words_probabilities:
+            probabilities = [item[0] for item in words_probabilities]
+            if len(set(probabilities)) < len(probabilities):
+                print(f'\nWarning: Several parts of "{raw_name}" are in the dictionary\n{words_probabilities}\n')
+
+            # Highest score wins; ties fall back to word, then position.
+            _, _, word_index = max(words_probabilities)
+
+            raw_words = raw_name.split()
+            # Only "First Last ..." needs swapping; a lone word has no partner.
+            if word_index == 0 and len(raw_words) > 1:
+                final_name = ' '.join([raw_words[1], raw_words[0]] + raw_words[2:])
+                print(f'Replacement: "{raw_name}" -> "{final_name}"')
+        else:
+            print(f'Warning: Name {raw_name} not found in dictionary')
+
+        final_signed_names.append((final_name, description))
+
+    return final_signed_names
+
+
 def main():
-    signed = load_signed()
-    update_readme(signed)
+    persons = load_signed()
+    persons_reordered = fix_names(persons)
+    update_readme(persons_reordered)
 
 
 if __name__ == '__main__':