-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathms_classification.py
174 lines (138 loc) · 6.61 KB
/
ms_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pandas as pd
import numpy as np
import re
import warnings
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import os
# (removed) stray debug print ('print("hello")') left over from development
def trim(df):
    """Normalize a DataFrame: clean column names, drop duplicate rows, and
    strip whitespace from all string (object) cells.

    Column names are stripped of surrounding whitespace (SAS-style strip),
    lower-cased, and spaces are replaced with underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Its column index is mutated in place; rows/values are
        cleaned on the returned copy.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (duplicates removed, object values stripped).
    """
    # SAS - strip removes trailing characters at start and end, and whitespaces / newlines
    df.columns = df.columns.str.strip()
    # Duplicates are dropped BEFORE value-stripping, so rows differing only in
    # surrounding whitespace are still considered distinct at this point.
    df = df.drop_duplicates()
    # Lower case + snake_case the column names.
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ', '_')
    # Strip whitespace from every string-typed cell (NaNs pass through .str.strip).
    df_obj = df.select_dtypes(['object'])
    df[df_obj.columns] = df_obj.apply(lambda col: col.str.strip())
    # Fixed typos in the status messages: "striped" -> "stripped".
    print("All column names have been stripped, lowered case, replaced space with underscore if any")
    print("Dropped duplicated instances if any")
    print("Categorical instances have been stripped")
    return df
def shape(df, df_name):
    """Print a STATUS line reporting the dimensions of *df*, labelled *df_name*."""
    dims = df.shape
    message = f'STATUS: Dimension of "{df_name}" = {dims}'
    print(message)
# --- Load and filter the transcription dataset --------------------------------
# Widen pandas' display so long keyword strings are not truncated when printed.
pd.set_option('display.max_colwidth', 255)
# Expects 'mtsamples.csv' (medical transcription samples) in the working dir.
df = pd.read_csv('mtsamples.csv')
# Drop the redundant CSV index column.
df.drop('Unnamed: 0', axis=1, inplace=True)
df = trim(df)
df.head(3)  # NOTE(review): return value unused — only has an effect in a notebook
#df = df[df['medical_specialty'].isin(['Neurosurgery','ENT - Otolaryngology','Discharge Summary','Neurology'])]
#df = df[df['medical_specialty'].isin(['ENT - Otolaryngology','Neurology'])]
# Keep only the two specialties used for binary classification.
df = df[df['medical_specialty'].isin(['Neurosurgery','Neurology'])]
shape(df,'df')
# Task: Try and predict Medical speciality from keywords
# First check that all data has keywords
for medical_specialty in df['medical_specialty'].unique():
    print(medical_specialty)
dum=1  # no-op; debugger breakpoint anchor
#NOTE,: Thesetranscribed medical transcription sample reports and examples are provided by various users andare for reference purpose only. MTHelpLine does not certify accuracy and quality of sample reports.These transcribed medical transcription sample reports may include some uncommon or unusual formats;this would be due to the preference of the dictating physician. All names and dates have beenchanged (or removed) to keep confidentiality. Any resemblance of any type of name or date orplace or anything else to real world is purely incidental.,
# make sure indexes pair with number of rows
df = df.reset_index()
preClean_L=len(df)  # row count before keyword cleaning, for the summary print
# Marker substring identifying rows whose keywords contain the boilerplate
# MTHelpLine disclaimer (reproduced in full below).
subString="MTHelpLine does not certify accuracy"
fullString="NOTE,: Thesetranscribed medical transcription sample reports and examples are provided by various users andare for reference purpose only. MTHelpLine does not certify accuracy and quality of sample reports.These transcribed medical transcription sample reports may include some uncommon or unusual formats;this would be due to the preference of the dictating physician. All names and dates have beenchanged (or removed) to keep confidentiality. Any resemblance of any type of name or date orplace or anything else to real world is purely incidental.,"
# Number of characters to cut from the end of a contaminated keywords string.
dropNfull=len(fullString)
# Clean the 'keywords' column row by row: drop NaN/too-short rows, trim the
# boilerplate disclaimer. itertuples() iterates a snapshot taken here, so
# dropping rows from df inside the loop body is safe.
for row in df.itertuples():
    # Load current keyword and convert to string (NaN becomes the text 'nan')
    current_kw=str(row.keywords)
    index=row.Index
    #current_kw=str(row['keywords'])
    if current_kw=='nan':
        #df=df.drop(index,axis=0)
        df = df.drop(index)
        #print("Dropping row "+index+"with content: "+current_kw)
        print("Found a NaN. Dropping row "+str(index))
    elif len(current_kw) < 10:
        # Fewer than 10 characters of keywords carries too little signal.
        df = df.drop(index)
        print("Found short keywords. Dropping row " + str(index))
    elif subString in current_kw:
        # Remove the disclaimer by cutting the final len(fullString) chars.
        # NOTE(review): this assumes the disclaimer sits exactly at the END of
        # the keywords string — TODO confirm; the verification loop below
        # reports any rows where it survived.
        print("Found a keywords with Note at index "+str(index)+". Removed the final N characters from keywords.")
        df.at[index,'keywords']=current_kw[:-dropNfull]
print("Dataframe length reduced from "+str(preClean_L)+" to "+str(len(df)))
# Check that the Notes have actually been dropped.
for row in df.itertuples():
    current_kw = str(row.keywords)
    index = row.Index
    if subString in current_kw:
        print("Found a keywords with Note at index "+str(index)+". Should have been removed.")
dum=1  # no-op; debugger breakpoint anchor
# Apply bag of words algorithm from sk-learn
count_vect=CountVectorizer()
# X: sparse document-term matrix — one row per sample, one column per token.
X=count_vect.fit_transform(df.iloc[:]['keywords'])
feats=count_vect.get_feature_names_out()
feat_count=np.sum(X,0)  # per-token totals over all documents; shape (1, n_features) — was 1x2075
feat_count_sorted=np.sort(feat_count)  # NOTE(review): unused downstream
feats_sorted=feats[feat_count.argsort()]  # tokens ordered by frequency; NOTE(review): unused downstream
print("Still have "+str(np.count_nonzero(df['medical_specialty']=='Neurology'))+" neurology entries")
#########################################################################################
# Set keywords as input data, and medical_specialty as target data
# Split the training data, and use NB or SVM classifiers
y=df.iloc[:]['medical_specialty']  # target labels (specialty per sample)
dum=1;
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
# GaussianNB needs a dense array, so densify the sparse count matrix.
X_array=X.toarray()
# 50/50 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_array, y, test_size=0.5, random_state=101)
gnb = GaussianNB()
mnb = MultinomialNB()
svc = SVC()
sgd = SGDClassifier()  # default loss='hinge' -> linear SVM fitted by SGD
# SGD with log loss == logistic regression fitted by SGD.
# NOTE(review): the name shadows sklearn.metrics.log_loss (harmless here
# since metrics is only used via the module name).
log_loss=SGDClassifier(loss='log_loss')
gnb.fit(X_train, y_train)
mnb.fit(X_train, y_train)
svc.fit(X_train,y_train)
sgd.fit(X_train, y_train)
log_loss.fit(X_train,y_train)
# Making Predictions
gnb_pred = gnb.predict(X_test)
mnb_pred = mnb.predict(X_test)
svc_pred = svc.predict(X_test)
sgd_pred = sgd.predict(X_test)
ll_pred = log_loss.predict(X_test)
# Manually finding f1 (micro): micro-averaged F1 equals plain accuracy here.
gnb_score = np.count_nonzero(y_test==gnb_pred)/len(y_test)
mnb_score = np.count_nonzero(y_test==mnb_pred)/len(y_test)
svc_score = np.count_nonzero(y_test==svc_pred)/len(y_test)
sgd_score = np.count_nonzero(y_test==sgd_pred)/len(y_test)
lL_score = np.count_nonzero(y_test==ll_pred)/len(y_test)
Nclass= len(df['medical_specialty'].unique())
preds = [gnb_pred, mnb_pred, svc_pred, sgd_pred, ll_pred]
if Nclass>2:
    # Multiclass: weighted F1 via sklearn.
    average_score = 'weighted'
    f1s = [round(metrics.f1_score(y_test,i,average=average_score)*100,2) for i in preds]
else:
    # sklearn throwing errors cause how predications are shaped.. Manually finding f1
    f1s = [round(np.count_nonzero(y_test==i)/len(i) * 100, 2) for i in preds]
# Report each classifier's accuracy (micro F1) on the held-out split.
print("Gaussian NB has "+str(gnb_score*100)+"%")
print("Multinomial NB has "+str(mnb_score*100)+"%")
# Bug fix: the original printed sgd_score under the "svc (without SGD)" label
# and svc_score under "svm with SGD" — the labels and values were swapped.
print("svc (without SGD) has "+str(svc_score*100)+"%")
print("svm with SGD has "+str(sgd_score*100)+"%")
print("LogReg with SGD has "+str(lL_score*100)+"%")
def addlabels(x,y):
    """Annotate the current matplotlib axes with one text label per bar.

    For each position i in range(len(x)), draws the value y[i] as text at
    coordinates (i, y[i]) — i.e. the bar's height printed at its top.
    """
    for position, _label in enumerate(x):
        plt.text(position, y[position], y[position])
# --- Bar chart of per-classifier F1 scores ------------------------------------
x_=['Gaussian NB','Multinomial NB','svm','svm with sgd','LogReg with SGD']
scores=[gnb_score,mnb_score,svc_score,sgd_score,lL_score]
scores = [round(i * 100,2) for i in scores]  # NOTE(review): computed but unused; the bars plot f1s
plt.bar(x_,f1s)
addlabels(x_,f1s)  # print each score on top of its bar
plt.grid(color='k', linestyle='--', linewidth=0.2)
plt.suptitle('Classifying '+str(Nclass)+' specialities from '+str(len(df))+' samples')
plt.ylabel('F1 score', fontsize=16)
plt.show()
dum=1  # no-op; debugger breakpoint anchor