preprocess.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from glob import glob

import MeCab
import mojimoji


def open_file(file):
    """Read a file, skip its first two lines, and join the rest into one space-separated line."""
    with open(file, 'r') as f:
        lines = f.readlines()
    ret = ''
    for i, line in enumerate(lines):
        if i > 1:
            ret += line.replace('\n', ' ')
    return ret


def directory_parse(directory):
    """Recursively collect {path: body text} for every file, skipping LICENSE and README files."""
    ret_dict = {}
    file_list = glob(directory + '/*')
    for f in file_list:
        if os.path.isdir(f):
            ret_dict.update(directory_parse(f))
        elif 'LICENSE' in f or 'README' in f:
            pass
        else:
            ret_dict[f] = open_file(f)
    return ret_dict


class Tokenizer():
    '''
    Tokenize Japanese text into normalized base forms with MeCab
    (mecab-ipadic-neologd dictionary).
    '''
    def __init__(self):
        self.mt = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

    def fit_on_texts(self, sentence):
        """Yield normalized tokens from `sentence`, skipping symbols (記号)."""
        node = self.mt.parseToNode(sentence)
        while node:
            feature = node.feature.split(',')
            if feature[0] != '記号':
                # feature[-3] is the base form (原形) in the ipadic feature layout;
                # convert full-width ASCII to half-width and lowercase it.
                ret = mojimoji.zen_to_han(feature[-3], kana=False).lower()
                if ret != '*' and ret != ' ':
                    yield ret
            node = node.next

    def texts_to_sequences(self, content):
        """Return the document as a single space-separated string of tokens."""
        ret = ''
        for item in self.fit_on_texts(content):
            ret += item
            ret += ' '
        return ret

    def tokenize(self, contents):
        """Tokenize every document in a {path: text} dict, returning one token string per document."""
        ret_list = []
        for k, content in contents.items():
            ret_list.append(self.texts_to_sequences(content))
        return ret_list


tk = Tokenizer()
contents = directory_parse('./text/')
words = tk.tokenize(contents)

# Write one tokenized document per line.
with open('all_sentences.dat', 'w') as f:
    for word in words:
        f.write(word)
        f.write('\n')
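
# A minimal read-back sketch: all_sentences.dat holds one space-separated
# token sequence per line, so downstream code can load it like this
# (`tokenized_docs` is an illustrative name, not defined above).
with open('all_sentences.dat') as f:
    tokenized_docs = [line.split() for line in f]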