diff --git a/Post-Processing-Scripts/algorithms_latest.py b/Post-Processing-Scripts/algorithms_latest.py
new file mode 100644
index 0000000..80d7437
--- /dev/null
+++ b/Post-Processing-Scripts/algorithms_latest.py
@@ -0,0 +1,267 @@
''' Calculates accuracies for different test:train splits for four algorithms
(Naïve Bayes, SVM, Neural Network, Random Forest). Run this script on a folder
containing extracted content files from a given set of URLs. '''

import csv
import io
import os
import warnings

import numpy as np
import requests
from numpy import array
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from tempfile import TemporaryFile

warnings.simplefilter(action='ignore', category=FutureWarning)


def loadKeywords(keyPath, ngram=False):
    ''' Fits a CountVectorizer on the keyword file and returns it. '''
    if os.path.exists(keyPath):
        with open(keyPath, 'r') as f:
            keywords_content = f.read()
    else:
        print("Keyword path is not valid!")
        return None
    if ngram:
        count_vect = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    else:
        count_vect = CountVectorizer(lowercase=True, stop_words='english')
    count_vect.fit_transform([keywords_content])
    return count_vect


def download_file(url, i):
    ''' Streams a URL to disk as the i-th numbered file and returns its path. '''
    local_filename = '/Users/prerana/Desktop/Post_Processing/200_files/' + str(i)
    # stream=True avoids loading the whole response into memory at once.
    r = requests.get(url, stream=True, headers={'User-agent': 'Mozilla/5.0'})
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename


def transformPCA(x_n):
    ''' Standardises the count matrix, then projects it onto its first 5 principal components. '''
    from sklearn.decomposition import PCA
    x_scaled = StandardScaler().fit_transform(x_n)
    pca = PCA(n_components=5)
    return pca.fit_transform(x_scaled)


def mergeAllContents():
    ''' Concatenates every extracted content file into all200Files.txt. '''
    all_files = os.listdir("otherstotext/")
    with open("all200Files.txt", "w") as big_f:
        for i in all_files:
            with io.open("otherstotext/" + str(i), "r", encoding="utf-8", errors='ignore') as f:
                big_f.write(f.read())


def closeWords(model, word, topN):
    ''' Returns the topN word2vec neighbours of a (keyword, column) pair as (word, score) tuples. '''
    try:
        indexes, metrics = model.cosine(word[0], n=topN)
    except KeyError:
        # Keyword is not in the word2vec vocabulary.
        return []
    neighbours = model.generate_response(indexes, metrics).tolist()
    return neighbours[:topN]


def closeWordsList(modelBin, keywords, i):
    ''' Collects the top-i neighbours of every keyword as (column index, neighbour word) pairs. '''
    import word2vec
    model = word2vec.load(modelBin)
    listTopN = []
    for word in keywords:  # word is a (keyword, column index) pair
        for k in closeWords(model, word, i):
            listTopN.append((word[1], k[0]))
    return listTopN


def addCloseCounts(listTopN, x):
    ''' Credits close-word hits to the originating keyword's column in the count matrix x.
    Assumes the rows of x follow the same os.listdir order used to build it. '''
    all_files = os.listdir("otherstotext/")
    for m, i in enumerate(all_files):
        with io.open("otherstotext/" + str(i), "r", encoding="utf-8", errors='ignore') as f:
            content = f.read()
        if not content:
            continue
        for col, neighbour in listTopN:
            if str(neighbour) in content:
                x[m][col] += 1
    return x


def sortingDict(x):
    ''' Sorts a vocabulary dict into (keyword, column index) pairs, ordered by column. '''
    import operator
    sorted_x = sorted(x.items(), key=operator.itemgetter(1))
    return sorted_x
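# --- Illustrative sketch (not part of the pipeline; paths are placeholders) ---
# How the helpers above chain together: loadKeywords() fits the vectorizer,
# sortingDict() orders its vocabulary by column index so features line up with
# the trained model, and closeWordsList() expands each keyword with its
# word2vec neighbours:
#
#   count_vect = loadKeywords("features.txt")         # hypothetical path
#   kList = sortingDict(count_vect.vocabulary_)       # [(keyword, column), ...]
#   listTopN = closeWordsList("ocean.bin", kList, 5)  # [(column, neighbour), ...]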
def cosineSimilarityScore(test_url, gold_standard_url):
    ''' Cosine similarity between a test URL's feature vector and the gold standard's. '''
    from scipy import sparse
    from sklearn.metrics.pairwise import cosine_similarity
    A = np.array([test_url, gold_standard_url])
    sparse_A = sparse.csr_matrix(A)
    similarities_sparse = cosine_similarity(sparse_A, dense_output=False)
    # The off-diagonal entry is the similarity between the two vectors.
    return similarities_sparse[0, 1]


def accuracy(y_pred, y_test):
    ''' Counts exact matches, plus predictions where both the predicted and true
    labels fall in the relevant band 1-5 (relevant vs. irrelevant agreement). '''
    accNum = 0
    for a in range(len(y_test)):
        if y_pred[a] == y_test[a]:
            accNum += 1
        elif y_pred[a] in [1, 2, 3, 4, 5] and y_test[a] in [1, 2, 3, 4, 5]:
            accNum += 1
    return accNum
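# --- Illustrative example (toy values, not from the dataset) ---
# accuracy() counts exact matches plus any prediction where both the predicted
# and true labels fall in the relevant band 1-5, so it effectively measures
# relevant-vs-irrelevant agreement:
#
#   accuracy([1, 3, 0], [2, 3, 0])   # -> 3; the 1 vs 2 mismatch still counts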
def main():
    # This must be the same keyword list/order used to train the ML model.
    keywordPath = "/Users/prerana/Desktop/Post_Processing/features.txt"
    count_vect = loadKeywords(keywordPath, False)
    keywords = count_vect.vocabulary_
    sorted_keywords = sortingDict(keywords)
    kList = []
    for item in sorted_keywords:
        kList.append(item)
    listTopN = closeWordsList('/Users/prerana/Desktop/Post_Processing/ocean.bin', kList, 5)

    x_train = []
    y_train = []

    with open('/Users/prerana/Desktop/Post_Processing/train.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            # Relevancy score for each URL is in the 2nd column of the CSV file.
            y_train.append(row[1])
    noneContents = []
    x_n = None
    y_n = array(y_train)

    all_files = os.listdir("otherstotext/")
    for i in all_files:
        with io.open("otherstotext/" + str(i), "r", encoding="utf-8", errors='ignore') as f:
            content = f.read()
        if content:
            tempX = count_vect.transform(content.split())
            x_train.append(tempX)
            # One row of summed keyword counts per content file.
            if x_n is None:
                x_n = array([tempX.toarray().sum(axis=0)])
            else:
                x_n = np.concatenate((x_n, [tempX.toarray().sum(axis=0)]), axis=0)
        else:
            noneContents.append(i)

    # Round-trip through a text file so the counts come back with an int dtype.
    np.savetxt('/Users/prerana/Desktop/Post_Processing/x_n.txt', x_n, fmt='%d')
    x = np.loadtxt('/Users/prerana/Desktop/Post_Processing/x_n.txt', dtype=int)
    # y_n.txt holds the integer relevancy labels, one per content file.
    y = np.loadtxt('/Users/prerana/Desktop/Post_Processing/y_n.txt', dtype=int)
    x_with_closeWords = addCloseCounts(listTopN, x)

    mergeAllContents()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

    cv = ShuffleSplit(n_splits=5, test_size=0.2)

    clf = GaussianNB()
    scoreNB = cross_val_score(clf, x, y, cv=cv)
    clf11 = GaussianNB()
    scoreNB2 = cross_val_score(clf11, x_with_closeWords, y, cv=cv)
    clf1 = GaussianNB().fit(x_train, y_train)
    y_pred = clf1.predict(x_test)
    accNum = accuracy(y_pred, y_test)

    print("\n\nModel: Naive Bayes")
    print("CV Accuracy (keywords only): " + str(scoreNB.mean()))
    print("CV Accuracy (with close words): " + str(scoreNB2.mean()))
    acc = (y_test == y_pred).sum() / float(len(y_test))
    print("Test Accuracy: " + str(acc))
    acc_train = (y_train == clf1.predict(x_train)).sum() / float(len(y_train))
    print("Train Accuracy: " + str(acc_train))
    print("Relevant-band Accuracy: " + str(accNum / float(len(y_test))))
    print("******************")

    from sklearn import linear_model

    # SGDClassifier with its default hinge loss fits a linear SVM.
    clf22 = linear_model.SGDClassifier(max_iter=2000, learning_rate='optimal')
    scoreSVM = cross_val_score(clf22, x, y, cv=cv)
    clf222 = linear_model.SGDClassifier(max_iter=2000, learning_rate='optimal')
    scoreSVM2 = cross_val_score(clf222, x_with_closeWords, y, cv=cv)
    clf2 = linear_model.SGDClassifier(max_iter=2000, learning_rate='optimal').fit(x_train, y_train)
    y_pred2 = clf2.predict(x_test)
    accNum2 = accuracy(y_pred2, y_test)

    print("\n\nModel: SVM")
    print("CV Accuracy (keywords only): " + str(scoreSVM.mean()))
    print("CV Accuracy (with close words): " + str(scoreSVM2.mean()))
    acc = (y_test == y_pred2).sum() / float(len(y_test))
    print("Test Accuracy: " + str(acc))
    acc_train = (y_train == clf2.predict(x_train)).sum() / float(len(y_train))
    print("Train Accuracy: " + str(acc_train))
    print("Relevant-band Accuracy: " + str(accNum2 / float(len(y_test))))
    print("******************")

    # learning_rate only takes effect with solver='sgd'; MLPClassifier defaults to adam.
    clf33 = MLPClassifier(max_iter=2000, learning_rate='adaptive')
    scoreNN = cross_val_score(clf33, x, y, cv=cv)
    clf333 = MLPClassifier(max_iter=2000, learning_rate='adaptive')
    scoreNN3 = cross_val_score(clf333, x_with_closeWords, y, cv=cv)
    clf3 = MLPClassifier(max_iter=2000, learning_rate='adaptive').fit(x_train, y_train)
    y_pred3 = clf3.predict(x_test)
    accNum3 = accuracy(y_pred3, y_test)

    print("\n\nModel: Neural Network")
    print("CV Accuracy (keywords only): " + str(scoreNN.mean()))
    print("CV Accuracy (with close words): " + str(scoreNN3.mean()))
    acc = (y_test == y_pred3).sum() / float(len(y_test))
    print("Test Accuracy: " + str(acc))
    acc_train = (y_train == clf3.predict(x_train)).sum() / float(len(y_train))
    print("Train Accuracy: " + str(acc_train))
    print("Relevant-band Accuracy: " + str(accNum3 / float(len(y_test))))
    print("******************")

    from sklearn.ensemble import RandomForestClassifier

    clf44 = RandomForestClassifier(n_estimators=100)
    scoreRF = cross_val_score(clf44, x, y, cv=cv)
    clf444 = RandomForestClassifier(n_estimators=100)
    scoreRF4 = cross_val_score(clf444, x_with_closeWords, y, cv=cv)
    clf4 = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
    y_pred4 = clf4.predict(x_test)
    accNum4 = accuracy(y_pred4, y_test)

    print("\n\nModel: Random Forest")
    print("CV Accuracy (keywords only): " + str(scoreRF.mean()))
    print("CV Accuracy (with close words): " + str(scoreRF4.mean()))
    acc = (y_test == y_pred4).sum() / float(len(y_test))
    print("Test Accuracy: " + str(acc))
    acc_train = (y_train == clf4.predict(x_train)).sum() / float(len(y_train))
    print("Train Accuracy: " + str(acc_train))
    print("Relevant-band Accuracy: " + str(accNum4 / float(len(y_test))))
    print("******************")

    # Scratch buffers; swap in real paths here to persist the arrays across runs.
    noneContents = array(noneContents)
    xOut = TemporaryFile()
    yOut = TemporaryFile()
    noneContentsOut = TemporaryFile()
    np.save(xOut, x_n)
    np.save(yOut, y_n)
    np.save(noneContentsOut, noneContents)


if __name__ == '__main__':
    main()
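# Expected external inputs (paths hard-coded above):
#   features.txt  - keyword list used to fit the CountVectorizer
#   ocean.bin     - word2vec binary model used for close-word expansion
#   train.csv     - URL relevancy scores (2nd column)
#   y_n.txt       - integer relevancy labels, one per content file
#   otherstotext/ - directory of extracted text files, one per URL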