-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path3G_total_count.py
109 lines (92 loc) · 3.24 KB
/
3G_total_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
This script merges all language-specific letter pairs into a global total.
By using each pair's highest score across the languages instead of the total
sum or an average, no pair that is relevant in any language gets lost or
ranked down.
"""
import json
import operator
from collections import defaultdict
from pprint import pprint
# Language codes to merge. Each code names a directory below
# count/by_language/ that holds the language's list.json and dictionary.json.
LANGUAGES = [
    "cs", "de", "en", "es", "et", "fi",
    "fr", "hu", "it", "nl", "no", "pl",
    "pt", "se", "sv", "da", "hr", "sl",
    "lt", "tr", "lv", "ro", "sk", "sq",
]
# Every quote character whose letter pairs get pooled together,
# grouped by family; the order is unchanged.
QUOTES = [
    '"', "'",  # straight double / single
    "«", "»",  # double guillemets
    "‘", "’", "‚", "‛",  # single curly, low-9 and reversed
    "“", "”", "„", "‟",  # double curly, low-9 and reversed
    "‹", "›",  # single guillemets
]
# HIGHSCORES
# Quote characters that are written back as stand-ins for all quote variants
# once the quote pairs of a language have been summed up.
# The first list is used when the quote is the left letter of a pair,
# the second when it is the right letter.
LEFT_PLACEHOLDER_QUOTES = ['"', "„", "«", "»"]
RIGHT_PLACEHOLDER_QUOTES = ['"', "«", "»"]

# Maps each letter pair to the highest normalized score seen in any language.
globalLetterPairs = {}

for LANGUAGE in LANGUAGES:
    # Dictionaries to store the summed counts of all pairs containing quotes,
    # used to generate all possible stylistic alternates.
    # Every sum starts at 1 instead of 0.
    # NOTE(review): presumably a floor so no quote pair gets a zero count — confirm.
    leftQuotes = defaultdict(lambda: 1)  # keyed by the letter right of the quote
    rightQuotes = defaultdict(lambda: 1)  # keyed by the letter left of the quote

    # The JSON files contain non-ASCII quote characters, so the encoding is
    # pinned to UTF-8 instead of relying on the platform's locale default.
    languageDir = "count/by_language/" + LANGUAGE
    with open(languageDir + "/list.json", "r", encoding="utf-8") as inputList, open(
        languageDir + "/dictionary.json", "r", encoding="utf-8"
    ) as inputDict:
        # list.json is a sequence of [pair, count] items, including a "total" entry.
        letterPairs = dict(json.load(inputList))
        totalValue = letterPairs["total"]
        print(LANGUAGE, "Raw number of pairs:", len(letterPairs))
        # dictionary.json is nested: {leftLetter: {rightLetter: count}}.
        parentDict = json.load(inputDict)

    # Sum up all quotes
    for leftLetter, childrenDict in parentDict.items():
        for rightLetter, count in childrenDict.items():
            if leftLetter in QUOTES:
                leftQuotes[rightLetter] += count
            if rightLetter in QUOTES:
                rightQuotes[leftLetter] += count

    # Remove all keys containing quotes
    letterPairs = {
        pair: count
        for pair, count in letterPairs.items()
        if not any(QUOTE in pair for QUOTE in QUOTES)
    }
    print(LANGUAGE, "Without quotes", len(letterPairs))

    # Overwrite/add pairs containing representative quote characters
    for rightLetter, count in leftQuotes.items():
        for QUOTE in LEFT_PLACEHOLDER_QUOTES:
            letterPairs[QUOTE + rightLetter] = count
    for leftLetter, count in rightQuotes.items():
        for QUOTE in RIGHT_PLACEHOLDER_QUOTES:
            letterPairs[leftLetter + QUOTE] = count
    print(LANGUAGE, "With all placeholder quotes", len(letterPairs))

    # Scale every count relative to the language's "total" entry, to the size
    # of one book (100 pages à 3000 characters). The "total" entry itself is
    # still part of letterPairs and is scaled along with the rest.
    # NOTE(review): the previous comment spoke of clamping below a minimum of
    # 1 per book, but no clamping happens here — confirm whether it is wanted.
    letterPairs = {
        pair: count / totalValue * 3000 * 100 for pair, count in letterPairs.items()
    }

    # Store the highest existing value of the given languages. Updating in
    # place keeps pairs from earlier languages and, unlike rebuilding the dict
    # from a set union, gives a reproducible key order for equal scores.
    for pair, score in letterPairs.items():
        globalLetterPairs[pair] = max(globalLetterPairs.get(pair, 0), score)

# SORTING
# From here on globalLetterPairs is a list of (pair, score) tuples, highest
# score first; globalLetterPairsDict holds the same data as an ordered dict.
globalLetterPairs = sorted(
    globalLetterPairs.items(), key=operator.itemgetter(1), reverse=True
)
globalLetterPairsDict = dict(globalLetterPairs)

# OUTPUT
# ensure_ascii=False writes the quote characters verbatim, so the files must
# be opened as UTF-8 explicitly.
with open("count/total/list.json", "w", encoding="utf-8") as output_a, open(
    "count/total/dictionary.json", "w", encoding="utf-8"
) as output_b:
    json.dump(globalLetterPairs, output_a, indent=4, ensure_ascii=False)
    json.dump(globalLetterPairsDict, output_b, indent=4, ensure_ascii=False)