-
Notifications
You must be signed in to change notification settings - Fork 0
/
AnalyseFeedback.py
156 lines (146 loc) · 5.15 KB
/
AnalyseFeedback.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from FeedbackPTLocals import LOCAL_PREFIX
from FeedbackPTLocals import LOCAL_LOOKUP
from FeedbackPTLocals import LOCAL_NONSTOP
import sys
import getopt
import string
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import spacy
from spacy.lang.pt.examples import sentences
from spacy.lemmatizer import Lemmatizer
def left(s , amount):
    """Return the leading `amount` characters of `s` (Excel-style LEFT)."""
    prefix = s[:amount]
    return prefix
def show_wordcloud(data, sOutputFile='',bVerbose=False):
    """Render a word cloud for `data` and optionally save it to a file.

    Args:
        data: Text to visualize (any object; converted with str()).
        sOutputFile: Path for the saved figure; '' skips saving.
        bVerbose: Accepted for interface consistency with the other helpers.
    """
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42   # fixed seed -> reproducible cloud layout
    ).generate(str(data))
    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    # BUG FIX: save BEFORE plt.show(). show() blocks and tears down the GUI
    # figure, so calling savefig() afterwards wrote an empty image — the
    # reason the -o option was flagged "NOT WORKING" in the usage text.
    if len(sOutputFile)>0:
        print("Writing output file: ",sOutputFile)
        plt.savefig(sOutputFile)
    plt.show()
# Cache of loaded spaCy pipelines, keyed by language code. Loading a model
# reads it from disk, so it must not happen once per review.
_NLP_MODELS = {}

def clean_text(text,sLanguage,bVerbose=False):
    """Lower-case, tokenize and lemmatize `text`, dropping noise tokens.

    Keeps only alphabetic, non-stop tokens (stop words listed in
    LOCAL_NONSTOP are kept) longer than one character whose 3-letter prefix
    is not in LOCAL_PREFIX; lemmas are remapped through LOCAL_LOOKUP.

    Args:
        text: Raw review text.
        sLanguage: spaCy language code (e.g. 'pt'); the corresponding
            '<lang>_core_news_sm' model must be installed.
        bVerbose: Print per-token decisions when True.

    Returns:
        The kept lemmas joined by single spaces.
    """
    global LOCAL_LOOKUP
    global LOCAL_PREFIX
    global LOCAL_NONSTOP
    if bVerbose:
        print(".. Original: ",text)
    # lower text
    text = text.lower()
    # BUG FIX: spacy.load() was called on every invocation, reloading the
    # whole model for each review processed via DataFrame.apply. Cache it.
    nlp = _NLP_MODELS.get(sLanguage)
    if nlp is None:
        nlp = spacy.load(sLanguage + '_core_news_sm')
        _NLP_MODELS[sLanguage] = nlp
    lemmas = []
    for word in nlp(text):
        if word.is_stop and (word.text not in LOCAL_NONSTOP):
            if bVerbose:
                print(".... Ignore stop word: ",word.text)
        elif not word.is_alpha:
            if bVerbose:
                print(".... Ignore non alpha word: ",word.text)
        else:
            CurrentLemma = word.lemma_.strip()
            CurrentWord = word.text.strip()
            if len(CurrentWord)<=1:
                if bVerbose:
                    print(".... Ignore short word: ",CurrentWord)
            elif left(CurrentWord,3) in LOCAL_PREFIX:
                if bVerbose:
                    print(".... Ignore absurd word: ",CurrentWord)
            else:
                # Remap the lemma through the local correction table.
                if CurrentLemma in LOCAL_LOOKUP:
                    CurrentLemma=LOCAL_LOOKUP[CurrentLemma]
                lemmas.append(CurrentLemma)
                if bVerbose:
                    print(".... ",CurrentWord, "converted to ",CurrentLemma)
    # BUG FIX: the old code reused WordStr both as the running output and as
    # a scratch variable inside the "ignore" branches, clobbering the
    # accumulated words; collecting into a list and joining avoids that and
    # the quadratic string concatenation. The returned value (the lemmas
    # separated by single spaces) is unchanged.
    text = ' '.join(lemmas)
    if bVerbose:
        print("-- ",text)
    return(text)
def ReadContent(sFile, sDelimiter=',',bVerbose=False):
    """Load a one-column CSV of reviews, dropping blanks and duplicates.

    Args:
        sFile: Path of the CSV file to read.
        sDelimiter: Field delimiter passed to pandas (default ',').
        bVerbose: Accepted for interface consistency; unused here.

    Returns:
        A DataFrame with a single 'REVIEWS' column, NaN rows removed and
        duplicate rows collapsed.
    """
    frame = pd.read_csv(sFile, delimiter=sDelimiter, names=["REVIEWS"])
    return frame.dropna().drop_duplicates()
# clean text data
def main():
    """Parse command-line options, read the review CSV, clean the text and
    display a word cloud of the cleaned reviews.

    Options: -h help, -i/--ifile input CSV (required), -l/--language
    language code (default 'pt'), -d/--delimiter CSV delimiter (default
    ','), -o/--ofile output image, -v verbose.

    Returns:
        True on success, False on any parameter or processing error.
    """
    bVerbose=False
    bSuccess=True
    UsageString= '--- AnalyseFeedback.py ---\r\n'
    UsageString += '\t[-h] (Optional) Show this help message and exit\r\n'
    UsageString += '\t-i <Input csv file>\r\n'
    UsageString += '\t[-l] (Optional) <Language. Defaults to pt>\r\n'
    UsageString += '\t[-d] (Optional) <Delimiter. Defaults to ,>\r\n'
    UsageString += '\t[-o] (Optional) (NOT WORKING!) <Output graphic file (with extension)>\r\n'
    UsageString += '\t[-v] (Optional) Verbose output\r\n'
    sInputFile = ''
    sOutputFile = ''
    sLanguage='pt'
    sDelimiter=','
    # BUG FIX: ensure `opts` is always bound; previously a GetoptError left
    # it undefined and the loop below crashed with NameError instead of
    # reporting the usage error.
    opts = []
    # Parse command line options
    try:
        # BUG FIX: 'l:' (and the matching long option) was missing from the
        # option string, so the documented '-l <language>' flag always
        # raised GetoptError even though it is handled below.
        opts, args = getopt.getopt(sys.argv[1:],"hi:o:vl:d:",["ifile=","ofile=","language=","delimiter="])
    except getopt.GetoptError:
        print(UsageString)
        print("")
        print('ERROR: Unable to process command line parameters')
        bSuccess=False
    for opt, arg in opts:
        if opt == '-h':
            print(UsageString)
            sys.exit(1)
        elif opt == "-v":
            bVerbose = True
        elif opt in ("-l", "--language"):
            sLanguage = arg
        elif opt in ("-i", "--ifile"):
            sInputFile = arg
        elif opt in ("-o", "--ofile"):
            sOutputFile = arg
        elif opt in ("-d", "--delimiter"):
            sDelimiter = arg
    if len(sInputFile)<=0:
        print(UsageString)
        print("")
        print('ERROR: Missing input file (-i / --ifile=)')
        bSuccess=False
    if bSuccess:
        try:
            # read data
            print("Reading input file ",sInputFile)
            reviews_df = ReadContent(sInputFile,sDelimiter,bVerbose)
            print('%i lines to be processed' % len(reviews_df))
            print("Processing natural language (might take time) ..")
            reviews_df["review_clean"] = reviews_df["REVIEWS"].apply(lambda x: clean_text(x,sLanguage,bVerbose))
            print("Showing wordcloud ..")
            AllWords=" ".join(reviews_df["review_clean"])
            show_wordcloud(AllWords,sOutputFile,bVerbose)
        except Exception as e:
            print("")
            print('ERROR: Error processing input')
            print(str(e))
            bSuccess=False
    return bSuccess
if __name__ == '__main__':
    # Propagate failure to the shell so callers can detect processing errors.
    if not main():
        sys.exit(-1)