-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
executable file
·98 lines (75 loc) · 2.49 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
import glob
import json
import os
import re
import string
import sys
from bulstem.stem import BulStemmer
from collections import Counter
def read_file(filename):
path = os.path.join(os.path.dirname(sys.argv[0]), "noise_data", filename)
with open(path) as tmp_fd:
return tmp_fd.read()
def dict_word(word, words, val):
if word in words:
words[word] += val
else:
words[word] = val
cities = read_file("city_names.txt")
countries = read_file("country_names.txt")
stops = read_file("grammar_words.txt")
names = read_file("human_names.txt")
stemmer = BulStemmer.from_file('stem-context-1')
def extract_words(book, N):
with open(book) as book_fd:
raw_words = dict()
words = dict()
try:
for line in book_fd:
for word in line.split():
dict_word(word, raw_words, 1)
except UnicodeDecodeError:
print("Book", book, "has some non utf-8 chars and was not scanned to the end")
for word, count in raw_words.items():
word = word.strip(string.punctuation + "„“—")
if len(word) < 2 or word in cities or word in names or word in stops:
continue
word = word.lower()
if re.search(r"[^а-я]", word):
continue
try:
word = stemmer.stem(word)
dict_word(word, words, count)
except KeyError:
continue
raw_words.clear()
top_words = Counter(words).most_common(N)
words.clear()
tmp = list()
for word in top_words:
tmp.append(word[0])
top_words.clear()
return ' '.join(tmp)
def main():
books = dict()
for category in glob.glob("*"):
print("Starting", category)
os.chdir(category + "/txt")
for book in glob.glob("*txt"):
id, book_name = book.split('-', 1)
if id in books.keys():
books[id]["genre"].append(category)
continue
books[id] = dict()
books[id]["name"] = book_name[:-4]
books[id]["genre"] = list()
books[id]["genre"].append(category)
books[id]["words"] = list()
N = int(sys.argv[1]) if len(sys.argv) > 1 else 1000
books[id]["words"] = extract_words(book, N)
os.chdir("../..")
with open("/tmp/ops.json", 'w') as tmp:
json.dump(books, tmp, ensure_ascii=False)
if __name__ == "__main__":
main()