def strip_func(s):
    """Collapse each run of spaces in *s* to one space and strip both ends."""
    collapsed = re.sub(' +', ' ', s)
    return collapsed.strip()


def has_cyrillic(text):
    """Return True when *text* has a character in the range U+0400..U+04FF."""
    match = re.search('[\u0400-\u04FF]', text)
    return match is not None


class InvalidFileFormatException(Exception):
    """Script-specific exception type.

    NOTE(review): the raise sites are outside the visible hunks; presumably
    used for malformed signature files -- confirm against load_signed().
    """


def write_signed(signed, outp):
    """Write one table row per signature to *outp*.

    :param signed: sequence of pairs; items 0 and 1 of each pair fill the
        second and third columns.
    :param outp: object with a ``write`` method that accepts text.
    """
    row_template = '| {:<4} | {:<34} | {:<39} |\n'
    # Rows are numbered from 1 in the first column.
    for number, signature in enumerate(signed, start=1):
        outp.write(row_template.format(number, signature[0], signature[1]))
_FIRST_NAMES_URL = 'http://www.searchnames.ru/allnames.csv'
# Upper bound for the dictionary download so a stalled server cannot hang the run.
_DOWNLOAD_TIMEOUT_SECONDS = 60


def _load_first_names_vocabulary(url=_FIRST_NAMES_URL, timeout=_DOWNLOAD_TIMEOUT_SECONDS):
    """Download the first-name list from *url* and index it in a FuzzySet."""
    vocabulary = fuzzyset.FuzzySet()
    with urllib.request.urlopen(url, timeout=timeout) as response:
        for line in response:
            # Rows are cp1251-encoded, ';'-separated; the name is the second field.
            fields = line.decode('cp1251').replace('"', '').split(';')
            if len(fields) < 2:
                # Blank or malformed row: nothing to index.
                continue
            first_name = strip_func(fields[1])
            if first_name:
                vocabulary.add(first_name)
    return vocabulary


def _best_score(matches):
    """Return the highest score of a FuzzySet.get() result, 0.0 for no match."""
    # FuzzySet.get() yields None (not an empty list) when nothing matches.
    if not matches:
        return 0.0
    return max(match[0] for match in matches)


def _find_first_name_candidates(words, lookup, prob_treshold):
    """Return (score, word, index) for each word scoring >= *prob_treshold*."""
    candidates = []
    for word_index, word in enumerate(words):
        if not word:
            # Consecutive spaces produce empty words; keep the index, skip the lookup.
            continue
        score = _best_score(lookup(word))
        if score >= prob_treshold:
            candidates.append((score, word, word_index))
    return candidates


def _swap_first_two_words(raw_name):
    """Swap the first two space-separated words; return short names unchanged."""
    parts = raw_name.split(' ')
    if len(parts) < 2:
        return raw_name
    parts[0], parts[1] = parts[1], parts[0]
    return ' '.join(parts)


def fix_names(persons, prob_treshold=0.7):
    """
    Fix names order to "{Last Name} {First Name}" form.

    Every name is transliterated to Russian and each of its words is looked
    up in a fuzzy dictionary of first names downloaded at call time. When the
    best-matching word is the first word of the name, the first two words are
    swapped; the remaining words keep their order. Progress and warnings are
    printed to stdout.

    :param persons: iterable of ``(name, description)`` pairs
    :param prob_treshold: minimal fuzzy-match score for a word to count as a
        first name
    :return: list of ``(fixed_name, description)`` pairs in input order
    """
    vocabulary_first_names = _load_first_names_vocabulary()

    final_signed_names = []

    for raw_name, description in persons:
        name = transliterate.translit(raw_name, 'ru')

        if has_cyrillic(raw_name) and name != raw_name:
            print(
                f'\nWarning: the name "{raw_name}" contains non-cyrillic characters\n',
                f'Cyrillic replacement (manual) "{name}"\n'
            )

        words_probabilities = _find_first_name_candidates(
            name.split(' '), vocabulary_first_names.get, prob_treshold)

        final_name = raw_name
        if words_probabilities:
            # Warn when at least two words share the same score.
            score_counts = Counter(score for score, _, _ in words_probabilities)
            if score_counts.most_common(1)[0][1] > 1:
                print(f'\nWarning: Several parts of "{raw_name}" are in the dictionary\n{words_probabilities}\n')

            # Tuples compare by score first, then word, then position.
            _, _, word_index = max(words_probabilities)

            # Only a first name in leading position needs to move; any other
            # position is left as written.
            if word_index == 0:
                final_name = _swap_first_two_words(raw_name)
                if final_name != raw_name:
                    print(f'Replacement: "{raw_name}" -> "{final_name}"')
        else:
            print(f'Warning: Name {raw_name} not found in dictionary')

        final_signed_names.append((final_name, description))

    return final_signed_names


def main():
    """Load the signatures, normalise the name order and rewrite the README."""
    signed_persons = load_signed()
    signed_persons_fixed = fix_names(signed_persons)
    update_readme(signed_persons_fixed)