nlpUtil.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 19-4-2 9:19 AM
# @Author : liyang
# @File : nlpUtil.py
import pkuseg
import jieba
from pyhanlp import *
from config import *
import os
from sgpyHandle import SgpyDictionary

class NLPUtil(object):
    """Chinese word segmentation helpers backed by pkuseg, jieba and HanLP."""

    # Punctuation and whitespace tokens that are filtered out of the results.
    _stopwords = [u'\n', u',', u'。', u'、', u':', u'(',
                  u')', u'[', u']', u'.', u',', u' ', u'\u3000', u'”', u'“',
                  u'?', u'?', u'!', u'‘', u'’', u'…', '!', '+', '-', '*',
                  '#', '"', "'", '^', ':', '/', '%', '=', ';', u';', '@',
                  '{', '}', u'」', u'「', u'.', u'—', u'-', u'『', u'』',
                  u'□', u'【', u'】', u'◆', u'(', u')', u'·', u'`', u'·',
                  u'\t', u'\\n', u'|']
    stopwords = set(_stopwords)

    # Extend jieba's vocabulary with the Sogou Pinyin dictionary and the
    # Chinese name corpus (paths come from config).
    jieba.load_userdict(SGPY_DICTIONARY)
    jieba.load_userdict(CHINESE_NAME_CORPUS)
    # seg = pkuseg.pkuseg(user_dict=SGPY_DICTIONARY, postag=True)
    seg = pkuseg.pkuseg(postag=True)
    # with open(SGPY_DICTIONARY) as f:
    #     for word in f.read().strip().split('\n'):
    #         CustomDictionary.add(word)

    @classmethod
    def divideWordWithPkuseg(cls, text):
        """Segment text with pkuseg and return 'word:pos' pairs joined by spaces."""
        cuted_text = cls.seg.cut(text)
        res = ""
        for item in cuted_text:  # item is a (word, pos_tag) tuple
            if item[0] not in cls.stopwords:
                res += item[0] + ':' + item[1] + ' '
        return res

    @classmethod
    def divideWordWithJieba(cls, text):
        """Segment text with jieba and return the raw token list (no POS tags)."""
        cuted_text = jieba.lcut(text, HMM=True)
        # Emitting 'word:pos' pairs like the other methods would require the
        # jieba.posseg tokenizer instead of jieba.lcut, e.g.:
        # res = ""
        # for item, flag in jieba.posseg.lcut(text, HMM=True):
        #     if item not in cls.stopwords:
        #         res += item + ':' + flag + ' '
        # return res
        return cuted_text

    @classmethod
    def divideWordWithHanLP(cls, text):
        """Segment text with HanLP and return 'word:pos' pairs joined by spaces."""
        cuted_text = HanLP.segment(text)
        res = ""
        for item in cuted_text:  # item is a HanLP Term with .word and .nature
            if item.word not in cls.stopwords and item.word != '':
                res += item.word + ':' + item.nature.name + ' '
        return res


if __name__ == "__main__":
    text = '与牛共舞大数据研究中心建议关注:中海达、振芯科技、合众思壮、海格通信、北斗星通等。'
    res = NLPUtil.divideWordWithJieba(text)
    print(res)
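
    # Illustrative only (not part of the original file): the pkuseg and HanLP
    # helpers can be exercised on the same sample sentence; both return a
    # space-separated string of 'word:pos' pairs with stopwords removed.
    print(NLPUtil.divideWordWithPkuseg(text))
    print(NLPUtil.divideWordWithHanLP(text))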