Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Names autofix #1502

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
language: python
python:
- 3.6

install:
# Install the requirements
# Use -U to make sure we get the latest versions of everything so we notice any
# incompatibilities as soon as possible.
- pip install -U -r requirements.txt
- pip list

script:
- python update_readme.py
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fuzzyset==0.0.19
transliterate==1.10.2
96 changes: 90 additions & 6 deletions update_readme.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
#!/usr/bin/env python3

import hashlib
import os
import re
import hashlib
import urllib.request
from collections import Counter
from copy import copy
from operator import itemgetter

import fuzzyset
import transliterate


def strip_func(s):
    """Collapse runs of spaces in *s* and trim surrounding whitespace.

    Only runs of the ASCII space character are collapsed to a single space;
    tabs and other whitespace inside the string are left untouched. Leading
    and trailing whitespace of any kind is removed by ``str.strip``.

    :param s: text to normalise
    :return: the normalised text
    """
    collapsed = re.sub(' +', ' ', s)
    return collapsed.strip()


def has_cyrillic(text):
    """Tell whether *text* contains at least one Cyrillic character.

    A character counts as Cyrillic when it lies in the Unicode range
    U+0400..U+04FF, which is the only range this check looks at.

    :param text: string to inspect
    :return: ``True`` if any character is in that range, ``False`` otherwise
    """
    match = re.search('[\u0400-\u04FF]', text)
    return match is not None


class InvalidFileFormatException(Exception):
    """Error type for an input file whose contents have an unexpected format."""


def load_signed():
signed = []
signed_new = set()
Expand All @@ -21,7 +37,7 @@ def load_signed():
print('Skipping non-file "%s"' % filename)
continue

with open(filename) as inp:
with open(filename, encoding='utf-8-sig') as inp:
for i, line in enumerate(inp):
line = line.strip()
if not line:
Expand All @@ -38,13 +54,13 @@ def load_signed():
else:
signed_new.add((m.group(1).strip(), m.group(2).strip()))
for signature in signed_new:
signed.append(signature)
signed.append(signature)
return sorted(signed, key=lambda pair: hashlib.sha256(repr(pair).encode('utf-8')).hexdigest())


def write_signed(signed, outp):
    """Write the signatures to *outp* as numbered, fixed-width table rows.

    Each row has the form ``| N | name | description |`` where ``N`` starts
    at 1 and the three columns are left-aligned to 4, 34 and 39 characters.

    :param signed: sequence of signatures; element 0 and element 1 of each
        signature fill the second and third column
    :param outp: object with a ``write(str)`` method (e.g. an open text file)
    """
    for number, signature in enumerate(signed, start=1):
        # Exactly one row per signature.
        outp.write('| {:<4} | {:<34} | {:<39} |\n'.format(number, signature[0], signature[1]))


def update_readme(signed):
Expand All @@ -56,9 +72,77 @@ def update_readme(signed):
outp.write(line)


def fix_names(persons, prob_treshold=0.7):
    """
    Fix names order to "{Last Name} {First Name}" form.

    A list of first names is downloaded from searchnames.ru and loaded into a
    FuzzySet. Every word of each (transliterated) name is fuzzy-matched
    against that vocabulary. If the best-matching word is the first word of
    the name, the first two words are swapped; the order of the remaining
    words is not changed. Names without a match are kept as they are and a
    warning is printed.

    :param persons: iterable of ``(name, description)`` pairs
    :param prob_treshold: minimal fuzzy-match score for a word to be treated
        as a first name
    :return: list of ``(name, description)`` pairs, in input order, with the
        names reordered where a swap was applicable
    """
    # Load name dictionary
    vocabulary_first_names = fuzzyset.FuzzySet()

    with urllib.request.urlopen('http://www.searchnames.ru/allnames.csv') as response:
        lines = response.readlines()

    for line in lines:
        fields = line.decode('cp1251').replace('"', '').split(';')
        # Blank or malformed lines have no second column; skip them instead
        # of failing on fields[1].
        if len(fields) < 2:
            continue
        first_name = strip_func(fields[1])
        if first_name:
            vocabulary_first_names.add(first_name)

    final_signed_names = []

    # Find first names in our FuzzySet and put them in second place in the
    # line (we don't change the order of the rest)
    for raw_name, description in persons:
        name = transliterate.translit(raw_name, 'ru')

        if has_cyrillic(raw_name) and name != raw_name:
            print(
                f'\nWarning: the name "{raw_name}" contains non-cyrillic characters\n',
                f'Cyrillic replacement (manual) "{name}"\n'
            )

        # (score, word, position) for every word that looks like a first name.
        words_probabilities = []
        for word_index, word in enumerate(name.split(' ')):
            if not word:
                continue
            vocabulary_result = vocabulary_first_names.get(word)
            # FuzzySet.get() yields None when nothing is similar enough.
            if not vocabulary_result:
                continue
            current_prob = max(vocabulary_result, key=itemgetter(0))[0]
            if current_prob >= prob_treshold:
                words_probabilities.append((current_prob, word, word_index))

        final_name = raw_name
        if words_probabilities:
            score_counts = Counter(prob for prob, _, _ in words_probabilities)
            if score_counts.most_common()[0][1] > 1:
                print(f'\nWarning: Several parts of "{raw_name}" are in the dictionary\n{words_probabilities}\n')

            # Best candidate: highest score, ties broken by word then position.
            _, _, word_index = max(words_probabilities)

            splitted_raw_name = raw_name.split(' ')
            # Swap only when the first name leads AND there is a second word
            # to swap it with; otherwise the name is already in the target
            # order (or cannot be reordered).
            if word_index == 0 and len(splitted_raw_name) >= 2:
                final_name_list = [splitted_raw_name[1], splitted_raw_name[0]]
                final_name_list.extend(splitted_raw_name[2:])
                final_name = ' '.join(final_name_list)

                print(f'Replacement: "{raw_name}" -> "{final_name}"')
        else:
            print(f'Warning: Name {raw_name} not found in dictionary')

        final_signed_names.append((final_name, description))

    return final_signed_names


def main():
    """Load the signatures, normalise the name order and rewrite the README."""
    signed_persons = load_signed()
    signed_persons_fixed = fix_names(signed_persons)
    # The README is written once, from the corrected list only.
    update_readme(signed_persons_fixed)


if __name__ == '__main__':
Expand Down