-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremove_words.py
77 lines (63 loc) · 1.96 KB
/
remove_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import sys
from collections import Counter

from nltk.corpus import stopwords

from utils import clean_str
# Command-line handling: exactly one argument (the dataset name) is required.
if len(sys.argv) != 2:
    sys.exit("Use: python remove_words.py <dataset>")

# NOTE(review): `datasets` is not referenced anywhere else in the visible
# script, so the dataset name is not validated against it -- confirm whether
# a membership check was intended.
datasets = ['i2b2', 'mimic']
dataset = sys.argv[1]

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Load the raw corpus, one document per line.  The file is read as bytes;
# each line has surrounding ASCII whitespace stripped and is then decoded as
# latin1 (which maps every byte value, so decoding cannot fail).  The `with`
# block closes the handle even if reading raises.
with open('data/corpus/' + dataset + '.txt', 'rb') as f:
    doc_content_list = [line.strip().decode('latin1') for line in f]
# Corpus-wide count of every token produced by clean_str(); used to remove
# rare words.  Counter is a dict subclass, so `word_freq[word]` lookups work
# as they would on a plain dict.
word_freq = Counter()
for doc_content in doc_content_list:
    word_freq.update(clean_str(doc_content).split())
# Build the cleaned corpus: one space-joined string of kept tokens per
# document.  For the 'mr' and 'own' datasets every token is kept; for all
# other datasets a token is kept only if it is not a stop word and occurs at
# least 5 times corpus-wide.
keep_all_words = dataset in ['mr', 'own']  # loop-invariant: evaluate once

clean_docs = []
for doc_content in doc_content_list:
    words = clean_str(doc_content).split()
    if keep_all_words:
        doc_words = words
    else:
        doc_words = [
            word for word in words
            if word not in stop_words and word_freq[word] >= 5
        ]
    # A document whose tokens are all filtered out is still appended, as an
    # empty string (there is no fallback to the unfiltered text).
    clean_docs.append(' '.join(doc_words).strip())
# Write the cleaned documents to data/corpus/<dataset>.clean.txt, one per
# line, joined with '\n' (no newline is added after the last document).
# The `with` block closes the file even if the write raises.
clean_corpus_str = '\n'.join(clean_docs)
with open('data/corpus/' + dataset + '.clean.txt', 'w') as f:
    f.write(clean_corpus_str)
# Report per-line token-count statistics (minimum, maximum, mean) for the
# cleaned corpus file.
with open('data/corpus/' + dataset + '.clean.txt', 'r') as f:
    lines = f.readlines()

# Number of whitespace-separated tokens on each line.
doc_lens = [len(line.strip().split()) for line in lines]

# Take the extremes from the data itself rather than from a fixed 10000
# starting value (which would be reported as the minimum if every line were
# longer than that), and report zeros for an empty file instead of dividing
# by zero.
min_len = min(doc_lens) if doc_lens else 0
max_len = max(doc_lens) if doc_lens else 0
# `1.0 *` forces float division regardless of interpreter version.
aver_len = 1.0 * sum(doc_lens) / len(doc_lens) if doc_lens else 0.0

print('min_len : ' + str(min_len))
print('max_len : ' + str(max_len))
print('average_len : ' + str(aver_len))