-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcopy_of_yoyo_analysis.py
113 lines (78 loc) · 2.95 KB
/
copy_of_yoyo_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
"""Copy of yoyo_analysis.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1djpwKlfVoqUjFP7550FpSYduyVGf4BAb
#Import Libraries to be fast
"""
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import nltk
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.preprocessing import LabelBinarizer
#from nltk.tokenize.toktok import ToktokTokenizer
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import classification_report,confusion_matrix,
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import io
"""#Upload training data"""
# Location of the training-data CSV on the local machine.
path = "C:/Users/seanw/Downloads/training_data.csv"
df = pd.read_csv(path)
# Report how many rows were loaded.
print(df.shape[0])
# Inspection leftovers (presumably from the notebook this script was exported
# from): the first rows and the (rows, columns) shape. As plain script
# statements they are evaluated but nothing is displayed.
df.head(7)
df.shape
"""#Clean data
We remove punctuations and any stopwords like the, a, or, that add little meaning to the reviews
"""
def strip_punctuation(text):
    """Return the string Series *text* with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted. ``regex=True`` is passed explicitly: when the pattern is
    treated as a literal string it never matches and the text comes back
    unchanged. The same helper is used for the training and the judgement
    text so both are cleaned identically.
    """
    return text.str.replace(r'[^\w\s]', '', regex=True)


# we create a new column with the punctuation removed
df['no_punct'] = strip_punctuation(df['Text'])
# importing some stopword lists
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
nltk.download('stopwords')
# combine the stopword lists, then take 'no' out so it is not filtered away.
# list.remove() works in place and returns None, so it must not be chained
# onto the expression being assigned (that left all_stop_words as None).
combined_stop_words = set(ENGLISH_STOP_WORDS).union(stopwords.words('english'))
combined_stop_words.discard('no')
all_stop_words = sorted(combined_stop_words)
"""#Transform the text"""
# Split the data into training and testing groups (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df['no_punct'], df['Sentiment'], test_size=0.3, random_state=42
)
# check the dimensions
print("Number of training samples: {}".format(X_train.shape[0]))
print("Number of testing samples: {}".format(X_test.shape[0]))
# we use tf-idf to transform each word into its id and its importance in a sentence
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(stop_words=all_stop_words)
# the vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vectorizer
X_train = tv.fit_transform(X_train)
X_test = tv.transform(X_test)
# checking if the text has been transformed
print(X_train[0])
# the sentiment labels are passed to the model as pandas Series, exactly as
# train_test_split returned them
"""#Create Model"""
from sklearn import svm
# training the model
# the values were decided by for loops in another notebook
# NOTE(review): per the scikit-learn docs gamma applies to the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel="linear" - confirm
clf = svm.SVC(C=10, kernel="linear", gamma=2).fit(X_train, y_train)
# score the model on the held-out test data
result = clf.predict(X_test)
acc = accuracy_score(y_test, result)
print("accuracy is:", acc)
"""#Apply onto judgement data
We repeat the text preparation
"""
# Load the reviews to judge from a local CSV, like the training data.
# The Colab upload helper `files` is never imported in this script, so
# calling files.upload() would raise NameError.
# NOTE(review): assumes some_reviews.csv is in the working directory - adjust
# the path if it lives elsewhere.
reviews_path = "some_reviews.csv"
df = pd.read_csv(reviews_path)
# get rid of punctuations, using the same cleaning as the training text
df['no_punct'] = strip_punctuation(df['Text'])
# reuse the vectorizer fitted on the training data; do not refit it here
reviews = tv.transform(df['no_punct'])
result = clf.predict(reviews)
# finally we write the results to csv file
output = pd.DataFrame(data={"sentiment": result})
output.to_csv("predicted_labels.csv")
"""#□