-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathWordFreq.py
108 lines (92 loc) · 4.06 KB
/
WordFreq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Python Code to find out most frequent words from famous GRE lists
WordLists ( Courtesy: www.quizlet.com )
1. Barrons 1100 Words
2. Magoosh 1000 Words
3. Majortest Essential 1500 Words
4. Princeton 500 Words
5. Kaplan 500 Words
6. Manhattan Basic+Essential 1000 Words
Above words are stored in different text files along with definations. These text files are combined, parsed and word frequecy is calculated.
Word along with Frequenices is stored in output text file 'output.txt'
"""
from collections import defaultdict, Counter
import json
# Function to calculate word Frequency and store it into Dictionary
def wordListToFreqDict(wordlist):
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(zip(wordlist,wordfreq))
# Combine all wordslist text files into one and convert to lowercase.
filenames = ['WordLists/Kaplan.txt','WordLists/Magoosh.txt','WordLists/Manhattan.txt','WordLists/Barrons.txt','WordLists/Princeton.txt','WordLists/Majortest.txt']
with open('CombinedAllWords.txt', 'w') as outfile:
for fname in filenames:
with open(fname, encoding="ISO-8859-1") as infile:
for line in infile:
line = line.lower()
outfile.write(line)
# Open combined wordlist, remove definations and format words
with open("CombinedAllWords.txt") as f:
items = f.readlines()
combined_vocab = items
combined_vocab = list(map(lambda items: items.strip(), combined_vocab))
for i, w in enumerate(combined_vocab):
combined_vocab[i]=w.split(":", 1)[0]
# my custom code to create a super dictionary
combined_map = defaultdict(list)
with open("CombinedAllWords.txt") as f:
for line in f:
line = line.strip()
splitted = line.split(':')
if len(splitted) > 1:
word, meaning = splitted[0], splitted[1]
combined_map[word].append(meaning)
#Open file and write sorted dictionary values into text file one by one in decreasing order
with open("output/words.txt","w") as f:
# for key, value in sorted(wordListToFreqDict(t).items(), key=lambda k,v: (v,k), reverse =True):
for key, value in sorted(wordListToFreqDict(combined_vocab).items(), key=lambda x: (x[1], x[0]), reverse =True):
# l.write(" \n%s: %s" % (key, value))
word = key
freq = value
meaning = combined_map[word]
f.write("{}[{}]: {}\n".format(key, value, meaning))
counter = Counter([v for k, v in wordListToFreqDict(combined_vocab).items()])
# dump as json
jsonmap = {}
for word, frequency in sorted(wordListToFreqDict(combined_vocab).items(), key=lambda x: (x[1], x[0]), reverse =True):
if frequency not in jsonmap:
jsonmap[frequency] = defaultdict(dict)
jsonmap[frequency][word] = combined_map[word]
for frequency in jsonmap:
wordmap = jsonmap[frequency]
with open("output/{}.json".format(frequency), 'w') as f:
json.dump(wordmap, f, indent=4)
# dump as HTML with some shitty formatting
with open("output/words.html","w") as f:
html = "<!DOCTYPE html>\n<head>{}</head>\n<body>\n{}</body>\n</html>"
# add a metadata about occurence of each type
metadata = "<h1>Metadata</h1><ul>"
for k in range(6, 0, -1):
metadata += "<li>{} : {}</li>".format(k, counter[k])
metadata += "</ul>"
# f.write(metadata)
# real shit happens now
content = ""
count, prev = 0, 6
for i, (key, value) in enumerate(sorted(wordListToFreqDict(combined_vocab).items(), key=lambda x: (x[1], x[0]), reverse =True)):
# l.write(" \n%s: %s" % (key, value))
word = key
frequency = value
# reset counter
if frequency != prev:
count, prev = 0, frequency
meaning = "<ul>"
for m in combined_map[word]:
meaning += "<li>{}</li>".format(m)
meaning += "</ul>"
count += 1
content += "<h2>{} - {}</h2><h4>frequecy : {}</h4><h4>S.N.: {}/{}</h4>{}<hr/>\n".format(
i+1, key, frequency, count, counter[frequency], meaning)
# f.write(content)
body = "{}\n<hr/><hr/>\n{}".format(metadata, content)
html = html.format('', body)
f.write(html)