-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path
5B_filter_words.py
89 lines (81 loc) · 3 KB
/
5B_filter_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import json
from tqdm import tqdm
from pprint import pprint
# Language codes for which per-language count data exists under
# count/by_language/<code>/ (see the loading loop below).
# NOTE(review): these look like ISO 639-1 two-letter codes, but "se"
# (Northern Sami) alongside "sv" (Swedish) and "no" (Norwegian) is worth
# confirming against the actual directory names on disk.
LANGUAGES = [
    "cs",
    "da",
    "de",
    "en",
    "es",
    "et",
    "fi",
    "fr",
    "hr",
    "hu",
    "it",
    "lt",
    "lv",
    "nl",
    "no",
    "pl",
    "pt",
    "ro",
    "se",
    "sk",
    "sl",
    "sq",
    "sv",
    "tr",
]
# Preload every language's common kerning-pair counts so the main pass can
# look pairs up in memory instead of touching the filesystem per pair.
localCounts = dict()
for LANGUAGE in LANGUAGES:
    counts_path = "count/by_language/" + LANGUAGE + "/common_kerning_pairs.json"
    with open(counts_path, "r") as inputFile_A:
        localCounts[LANGUAGE] = dict(json.load(inputFile_A))
# Lazy per-language cache of words.json. The original reopened and re-parsed
# the same words file for every (pair, language) combination — O(pairs * 14)
# file reads; one read per language is enough, since the data is read-only.
wordLists = dict()


def _load_words(language):
    """Return the word dict for *language*, parsing its words.json at most once."""
    if language not in wordLists:
        with open("count/by_language/" + language + "/words.json", "r") as wordsFile:
            wordLists[language] = dict(json.load(wordsFile))
    return wordLists[language]


kerningPairs = dict()
with open("count/total/relevant_kerning_pairs.json", "r") as inputFile_B:
    for [key, relevance_score] in tqdm(json.load(inputFile_B)):
        # Rank the languages by how frequent this pair is locally.
        top_local_counts = list()
        for LANGUAGE, pairs in localCounts.items():
            if key in pairs:
                top_local_counts.append((LANGUAGE, pairs[key]))
        top_local_counts.sort(key=lambda x: x[1], reverse=True)

        # Collect sample words from the top 14 languages. Earlier (more
        # relevant) languages contribute a larger quota: 4, 3, 2, then 1 each.
        kerning_words = list()
        for i, (LANGUAGE, count) in enumerate(top_local_counts[:14]):
            words = _load_words(LANGUAGE)
            share = max(1, 4 - i)
            for word in words:
                if share < 1:
                    # Quota for this language exhausted; the original kept
                    # iterating to no effect — result is identical.
                    break
                if len(word) > 16 or word in kerning_words:
                    continue
                if key[0] == ' ':
                    # Pair starts with a space: the word must begin with the
                    # pair's lowercased second character (word[0] is compared
                    # as-is, matching the original logic).
                    if key[1].lower() == word[0]:
                        kerning_words.append(word.upper() if key[1].isupper() else word)
                        share -= 1
                elif key[1] == ' ':
                    # Pair ends with a space: the word must end with the
                    # pair's lowercased first character.
                    if key[0].lower() == word[-1]:
                        kerning_words.append(word.upper() if key[0].isupper() else word)
                        share -= 1
                else:
                    # Interior pair: the word must contain it. An all-caps
                    # pair is matched lowercased and the word is upper-cased
                    # for display.
                    if key.isupper():
                        if key.lower() in word:
                            kerning_words.append(word.upper())
                            share -= 1
                    elif key in word:
                        kerning_words.append(word)
                        share -= 1
        # Store the relevance score and the sampled words (up to 4+3+2+1 plus
        # 1 each from the remaining languages — not "Top 5" as the old
        # comment claimed).
        kerningPairs[key] = (relevance_score, kerning_words)
# Persist the pair -> (relevance_score, sample_words) mapping for later stages.
with open("result/relevant_words.json", "w") as output:
    json.dump(kerningPairs, output, indent=4, sort_keys=False)