-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
104 lines (91 loc) · 4.21 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -----------------------------------
# SENTIMENT ANALYSIS
# -----------------------------------
from .processing import tokenize_text
import pandas as pd
from germansentiment import SentimentModel
# Shared germansentiment model instance, created once when this module is
# imported and used by get_germansentiment() below.
model = SentimentModel()
# -----------------------------------
# SENTIWS
# -----------------------------------
# SENTIWS_GLOSSARY()
# -----------------------------------
# POSITIVE_LINES = .readlines() output for positive SentiWS file
# NEGATIVE_LINES = .readlines() output for negative SentiWS file
# SENTIMENT_GLOSSARY = pandas.DataFrame(); glossary of sentiment values, words etc. to work with
def sentiws_glossary(positive_lines, negative_lines):
    """Build a sentiment glossary DataFrame from raw SentiWS lines.

    Every line is parsed with the pattern ``word|POS<TAB>polarity<TAB>infl``,
    where ``infl`` is a comma-separated list of inflected forms.

    Args:
        positive_lines: lines (e.g. ``.readlines()`` output) of the positive
            SentiWS file.
        negative_lines: lines (e.g. ``.readlines()`` output) of the negative
            SentiWS file.

    Returns:
        pandas.DataFrame with one row per input line (positive entries first,
        then negative, index renumbered from 0) and the columns ``word``,
        ``pos_tag``, ``polarity`` (the captured text, not converted to a
        number) and ``infl`` (list of inflected forms). A line that does not
        match the pattern yields missing values in the first three columns
        and an empty list in ``infl``.
    """
    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
    # supported way to stack the two frames and also works on older versions.
    entries = pd.concat(
        [
            pd.DataFrame({'entries': positive_lines}),
            pd.DataFrame({'entries': negative_lines}),
        ],
        ignore_index=True,
    )

    # split every entry into its four fields; the raw "entries" column is
    # not carried over into the result
    sentiment_glossary = entries['entries'].str.extract(
        r"^(.*)\|(.*)\t(.*)\t(.*)$", expand=True
    )
    sentiment_glossary.columns = ['word', 'pos_tag', 'polarity', 'infl']

    # split infl into a list of inflections; unparsable lines leave a
    # non-string (missing) value here, which gets an empty list instead of
    # crashing on .split()
    sentiment_glossary['infl'] = [
        forms.split(",") if isinstance(forms, str) else []
        for forms in sentiment_glossary['infl']
    ]
    return sentiment_glossary
# GET_POLARITY_VALUES()
# -----------------------------------
# TEXT = list(); subset text
# SENTIMENT_DF = pandas.DataFrame(); sentiment glossary/output of SENTIWS_GLOSSARY()
# SENTIMENT_VALS = list(); all polarity values for TEXT based on SENTIMENT_DF
def get_polarity_values(text, sentiment_df):
    """Sum the glossary polarity of all known tokens for every passage.

    Each passage is tokenized with ``tokenize_text`` (imported from
    ``.processing``; assumed to yield hashable string tokens - TODO confirm).
    A token is first looked up in the ``word`` column; only if it is not a
    base word is it looked up among the inflected forms in ``infl``. When a
    word or inflection occurs in several rows, the first row wins.

    Args:
        text: iterable of passages.
        sentiment_df: glossary with the columns ``word``, ``polarity`` and
            ``infl`` (list of inflected forms per row, as produced by
            ``sentiws_glossary``; a comma-separated string is accepted too).

    Returns:
        list with one summed polarity per passage, in input order. A passage
        without any known token contributes ``0``.
    """
    # Build both lookup tables once. The previous per-token scan of the
    # frame was slow, and its inflection test (`infl == token`) compared
    # whole lists with a string, so inflected forms were never matched.
    word_polarity = {}
    infl_polarity = {}
    for word, polarity, forms in zip(
        sentiment_df["word"], sentiment_df["polarity"], sentiment_df["infl"]
    ):
        word_polarity.setdefault(word, polarity)
        if isinstance(forms, str):
            forms = forms.split(",")
        elif not isinstance(forms, (list, tuple)):
            continue  # missing value: this row has no inflections
        for form in forms:
            if form:  # ignore empty strings left by an empty infl field
                infl_polarity.setdefault(form, polarity)

    sentiment_vals = []
    for passage in text:
        passage_val = 0
        for token in tokenize_text(passage):
            # polarity is converted lazily so rows that are never hit
            # cannot break the run
            if token in word_polarity:
                passage_val += float(word_polarity[token])
            elif token in infl_polarity:
                passage_val += float(infl_polarity[token])
        sentiment_vals.append(passage_val)
    return sentiment_vals
# -----------------------------------
# GERMANSENTIMENT
# -----------------------------------
# GET_GERMANSENTIMENT()
# -----------------------------------
# TEXT_COL = pandas.DataFrame.column; text of a specific subset
# SENTIMENT = pandas.DataFrame(); germansentiment value for each text in TEXT_COL
def get_germansentiment(text_col):
    """Run the module-level germansentiment model over every entry of text_col.

    The entries are classified one at a time: for each position a
    one-element slice of ``text_col`` is handed to
    ``model.predict_sentiment`` and the returned value becomes one row of
    the result (presumably a one-element list holding the label - verify
    against the germansentiment API).

    Args:
        text_col: sliceable collection of texts, e.g. a DataFrame column.

    Returns:
        pandas.DataFrame with the single column ``germansentiment`` and one
        row per entry of ``text_col``.
    """
    predictions = [
        model.predict_sentiment(text_col[pos:pos + 1])
        for pos, _ in enumerate(text_col)
    ]
    return pd.DataFrame(predictions, columns=['germansentiment'])
# MAP_SENTIMENT()
# -----------------------------------
# DF = pandas.DataFrame(); must contain column "germansentiment" on which the function is applied
def map_sentiment(df):
    """Add a numeric ``germansentiment_mapped`` column to df.

    The labels in ``df.germansentiment`` are translated as
    ``"neutral" -> 0``, ``"positive" -> 1`` and ``"negative" -> -1``.

    Args:
        df: pandas.DataFrame containing the column ``germansentiment``.

    Returns:
        The same DataFrame object, modified in place, with the additional
        column ``germansentiment_mapped``.

    Raises:
        ValueError: if a label is not one of the three known values. Such
            labels used to be skipped silently, which surfaced later as an
            opaque length-mismatch ValueError during the column assignment.
            ``df`` is left unchanged in that case.
    """
    label_to_score = {"neutral": 0, "positive": 1, "negative": -1}
    sentiment_mapped = []
    for position, label in enumerate(df.germansentiment):
        try:
            sentiment_mapped.append(label_to_score[label])
        except (KeyError, TypeError):
            # TypeError covers unhashable labels such as lists
            raise ValueError(
                f"unknown germansentiment label {label!r} at position {position}; "
                f"expected one of {sorted(label_to_score)}"
            ) from None
    df["germansentiment_mapped"] = sentiment_mapped
    return df
# -----------------------------------
# COMPARE SENTIMENT SCORES
# -----------------------------------
# COMPARE_SENTIMENT()
# -----------------------------------
# PASSAGE_LOC = index label of DF (e.g. int or str); location of passage to inspect
# DF = pandas.DataFrame(); must contain the columns "text", "germansentiment" and "rel_sentiws" for them to be compared
def compare_sentiment(passage_loc, df):
    """Print one passage together with its germansentiment and SentiWS scores.

    Args:
        passage_loc: index label of the row in ``df`` to inspect.
        df: pandas.DataFrame containing the columns ``text``,
            ``germansentiment`` and ``rel_sentiws``.

    Raises:
        KeyError: if ``passage_loc`` is not a label of ``df.index`` or one of
            the three columns is missing. Nothing is printed in that case.
    """
    # Label-based .loc is used for all three columns so they are guaranteed
    # to come from the same row; plain Series[...] access can fall back to
    # positional lookup on non-integer indexes.
    text = df.loc[passage_loc, "text"]
    label = df.loc[passage_loc, "germansentiment"]
    score = df.loc[passage_loc, "rel_sentiws"]

    # str() on every value: a non-string text (e.g. NaN) used to raise
    # TypeError in the string concatenation.
    print("position of passage: " + str(passage_loc))
    print("text: " + str(text))
    print("score using germansentiment: " + str(label))
    print("score using sentiws: " + str(score))