tokenizec.py
import re
import json


class NlpTokenizer:

    # tokenize a single string, for example "rabee loai hindi"
    # the function receives a string of text and returns a list of lowercase tokens
    def tokenizec(self, text):
        # a word, an optional punctuation mark, then another word part
        # (raw string; the stray '|' separators and duplicated characters in the
        # original character class were removed)
        regx = r"\w+[.:?'!()-]?\w+"
        tk = re.findall(regx, text)
        tk = [item.lower() for item in tk]
        return tk

    # this function returns the set of tokens (removes duplicate tokens)
    # the function receives a string of text
    def unitoknizer(self, text):
        ListOfTokens = self.tokenizec(text)
        SetTkn = set(ListOfTokens)
        return SetTkn

    # this function returns the set of tokens for a single file, for example data['3843.eng']
    def singlefiletk(self, single_file):
        listt = []
        for tag in single_file.keys():
            listt.append(single_file[tag])
        string = ' '.join(map(str, listt))
        listt = self.unitoknizer(string)
        # listt = self.tokenizec(string)
        return listt

    # used to tokenize the whole document; the input is a dictionary of files
    def datatoken(self, datafile):
        list1 = {}
        for fn in datafile.keys():
            tags = []
            tokens = []
            for tag in datafile[fn]:
                tokens.append(self.tokenizec(str(datafile[fn][tag])))
                tags.append(tag)
            list1[fn] = dict(zip(tags, tokens))
        return list1

'''
nt = NlpTokenizer()
f = open('E:/Master/NLP/project/iaprtc12/iaprtc12/xyz.json')
data = json.load(f)
listt = nt.singlefiletk(data['3843.eng'])
print(listt)
f.close()
# listt = nt.datatoken(data)
# print(len(listt))
'''
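

# --- minimal usage sketch (added for illustration; the sample data below is made up) ---
# It assumes the annotation dictionary has the shape suggested by the commented-out
# example above: {file_name: {tag: text, ...}, ...}.
if __name__ == "__main__":
    nt = NlpTokenizer()

    # tokenize a single string and a string with repeated words
    print(nt.tokenizec("Rabee Loai Hindi"))    # ['rabee', 'loai', 'hindi']
    print(nt.unitoknizer("red car red car"))   # {'red', 'car'} (set, order may vary)

    # tokenize a small made-up annotation dictionary
    sample = {
        '0001.eng': {'TITLE': 'brown bear', 'DESCRIPTION': 'brown bear on rocks'},
        '0002.eng': {'TITLE': 'blue lake', 'DESCRIPTION': 'lake between mountains'},
    }
    print(nt.singlefiletk(sample['0001.eng']))  # unique tokens of one file
    print(nt.datatoken(sample))                 # token lists per tag for every file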