-
Notifications
You must be signed in to change notification settings - Fork 1
/
BN_Sentiment.py
116 lines (93 loc) · 3.71 KB
/
BN_Sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import numpy as np
import pandas as pd
from textblob import *
from tkinter import Tk
from tkinter.filedialog import askopenfilename, asksaveasfilename
from joblib import delayed, Parallel
import multiprocessing
from file_handling import *
from selection import *
def main():
    """Interactively add TextBlob sentiment scores to columns of a CSV file.

    Prompts for an input CSV, an output CSV and a delimiter, asks which
    columns to analyse, and for every selected column joins a
    "<column> - Sentiment" column (computed on the non-null rows of that
    column) back onto the full table. The combined table is then written
    to the chosen output file.

    Raises:
        SystemExit: If either file dialog is cancelled.
    """
    # Local import: only needed to derive the save dialog's starting folder.
    import os

    print("Program: Sentiment")
    print("Release: 1.2")
    print("Date: 2020-03-09")
    print("Author: Brian Neely")
    print()
    print()
    print("This program reads a csv file and will preform a sentiment analysis on a specified column.")
    print()
    print()

    # Hide Tkinter GUI
    Tk().withdraw()

    csv_filetypes = (("Comma Separated Values", "*.csv"), ("all files", "*.*"))

    # Find input file
    file_in = askopenfilename(initialdir="../", title="Select file",
                              filetypes=csv_filetypes)
    if not file_in:
        input("Program Terminated. Press Enter to continue...")
        # SystemExit is what exit() raises, but does not depend on the
        # site module having injected the exit() helper.
        raise SystemExit

    # Set output file; the dialog starts in the folder of the input file
    # (initialdir expects a directory, not a file path).
    file_out = asksaveasfilename(initialdir=os.path.dirname(file_in), title="Select file",
                                 filetypes=csv_filetypes)
    if not file_out:
        input("Program Terminated. Press Enter to continue...")
        raise SystemExit

    # Ask for delimiter
    delimiter = input("Please input Delimiter: ")

    # Read data (open_unknown_csv comes from one of the wildcard imports)
    data = open_unknown_csv(file_in, delimiter)

    # Create Column Header List
    headers = list(data.columns.values)

    # Select Columns for Sentiment Analysis
    column_list = column_selection_multi(headers, "sentiment analysis")

    # Create the output file up front (append mode leaves any existing
    # content alone); this raises before the scoring if it cannot be opened.
    with open(file_out, 'a'):
        pass

    # Loop through selected columns
    for column in column_list:
        # Remove NaN for a clean subset of data
        data_no_na = data.dropna(subset=[column], inplace=False)

        # Split data for Parallel Processing
        chunks = split_data(data_no_na)
        chunk_count = len(chunks)

        # Create sentiment score for Data using parallel processing
        print("Sentiment score creation...")
        scored_chunks = Parallel(n_jobs=-1)(
            delayed(sentiment_calculation)(chunk, column, par_index, chunk_count)
            for par_index, chunk in enumerate(chunks))
        print("Score Calculation Complete!")
        print()

        # Union split data frames
        data_no_na_out = pd.concat(scored_chunks)

        # Join back to original dataset; rows dropped as NaN above get no
        # match in the left join and therefore an empty score.
        data = data.join(data_no_na_out[str(column) + ' - Sentiment'], how='left')

    # Write CSV
    print("Writing CSV File...")
    data.to_csv(file_out, index=False)
    print("Wrote CSV File!")
    print()

    # Report every processed column; str() matches how the column name is
    # handled elsewhere and iterating avoids touching an unbound loop
    # variable when nothing was selected.
    for column in column_list:
        print("Sentiment Analysis Completed on column: [" + str(column) + "]")
    print("File written to: " + file_out)
    input("Press Enter to close...")
def sentiment_calculation(data, column, par_index, par_len):
    """Score one chunk of rows with TextBlob polarity.

    Args:
        data: Table chunk containing ``column``; it is not modified.
        column: Name of the column whose values are scored.
        par_index: Zero-based position of this chunk among all chunks.
        par_len: Total number of chunks, used only for the progress message.

    Returns:
        A copy of ``data`` with an added "<column> - Sentiment" column
        holding ``TextBlob(value).sentiment.polarity`` for each row.
    """
    polarity_column = str(column) + ' - Sentiment'
    scores = [TextBlob(text).sentiment.polarity for text in data[column]]

    # Work on a copy: the chunk is a slice of the caller's table, and
    # assigning a column onto a slice either mutates shared data or
    # triggers pandas' SettingWithCopyWarning.
    scored = data.copy()
    scored[polarity_column] = scores

    # par_index is zero-based, so add one for a human-readable "k out of n".
    print("Sentiment Calculation Complete on: " + str(par_index + 1) + " out of " + str(par_len) + "!")
    return scored
def split_data(data):
    """Split a table into consecutive row chunks for parallel processing.

    The target number of chunks is 16 per CPU reported by
    ``multiprocessing.cpu_count()``. Chunk boundaries are evenly spaced
    row positions rounded up to whole rows.

    Args:
        data: A pandas object supporting ``.iloc`` row slicing
            (e.g. a DataFrame).

    Returns:
        A list of non-empty consecutive slices of ``data`` that
        concatenate back to ``data`` in the original order. For empty
        input the list holds one empty slice, so the result can still be
        concatenated.
    """
    # *****Split data for parallel processing*****
    print("Calculating Splits...")

    # Find number of CPUs and multiply by 16 for number of parallel chunks
    num_splits = multiprocessing.cpu_count() * 16

    # Evenly spaced boundaries from 0 to len(data), rounded up to ints
    boundaries = np.ceil(np.linspace(0, len(data), num_splits)).astype(int)

    # Small inputs produce repeated boundaries; np.unique sorts and
    # de-duplicates them so no zero-length chunk is created.
    boundaries = np.unique(boundaries)

    # Positional slicing keeps each chunk's type and index, and avoids
    # np.split on a DataFrame, which relies on the deprecated
    # DataFrame.swapaxes.
    data_split = [data.iloc[start:stop]
                  for start, stop in zip(boundaries[:-1], boundaries[1:])]

    # Empty input: keep a single empty chunk so callers can still concat.
    if not data_split:
        data_split = [data.iloc[0:0]]

    print("Splits Calculated!")
    print()
    return data_split
    # *****End Split*****
# Script entry point: run the interactive workflow only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()