Preprocessing.py
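# Builds a cleaned text corpus from the datasets under venv/Data/:
#   1. collect raw texts from the CSV/TXT files
#   2. save the raw collection to TextNotProcessedv2.csv
#   3. clean each entry (URLs, non-letters, stop words, lemmatization, spell correction)
#   4. drop duplicates and write the result to FinalCorpus.csv and FinalCorpus.txt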
import pandas as pd
import re
import Methods as m
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from spellchecker import SpellChecker
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer  # imported but not used in this script
spell = SpellChecker(distance=1)
text_set = []  # raw texts collected from every dataset
corpus = []    # final, cleaned corpus
#----collect dataSet----
print("reading dataset 1")
dataSet1 = pd.read_csv('venv/Data/newUpdate.csv', names=['id', 'text'], header=1)
for text in dataSet1["text"]:
    text_set.append(text)
print("size of data", len(text_set))

print("reading dataset 2")
dataSet2 = pd.read_csv('venv/Data/protest.csv', names=['id', 'text'], header=1)
for text in dataSet2["text"]:
    text_set.append(text)
print("size of data", len(text_set))

print("reading dataset 3")
dataSet3 = pd.read_csv('venv/Data/corona.csv', names=['id', 'text'], header=1)
for text in dataSet3["text"]:
    text_set.append(text)
print("size of data", len(text_set))
print("reading dataset b")
dataSeta4 = pd.read_csv('venv/Data/datar.csv', names=['id', 'text'], header=1)
for text in dataSeta4["text"]:
text_set.append(text)
print("size of data" , len(text_set))
print("reading dataset 5")
dataSet5 = pd.read_csv('venv/Data/fashion.csv', names=['id', 'text'], header=1)
for text in dataSet5["text"]:
text_set.append(text)
print("size of data" , len(text_set))
print("reading dataset 6")
dataSet6 = pd.read_csv('venv/Data/Data.csv', names=['ID', 'TEXT'], header=1)
for text in dataSet6["TEXT"]:
text_set.append(text)
print("size of data" , len(text_set))
print("reading dataset 7")
dataSet7 = pd.read_csv('venv/Data/BuzzFeed_real_news_content.csv', names=['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data'], header=1)
for text2 in dataSet7["text"]:
text_set.append(text)
print("size of data" , len(text_set))
print("Reading Dataset 8")
dataSet8 = open("venv/Data/HealthData.txt", "r")
for text3 in dataSet8:
if text3.strip():
text_set.append(text3)
dataSet8.close()
print("size of data" , len(text_set))
print("reading dataset 9")
dataSet9 = pd.read_csv('venv/Data/News_Category_Description.csv', names=['ID', 'short_description'], header=1)
for text4 in dataSet9["short_description"]:
text4 = str(text4)
text_set.append(text4)
print("size of data", len(text_set))
print("reading dataset 10")
dataSet10 = pd.read_csv('venv/Data/helth2.csv', names=['ID', 'text'], header=1)
for text5 in dataSet10["text"]:
text_set.append(text5)
print("size of data" , len(text_set))
print("reading dataset 11")
dataSet11 = pd.read_csv('venv/Data/news.csv', names= ['ID', 'text'], header=1)
for text6 in dataSet11["text"]:
text_set.append(text6)
print("size of data" , len(text_set))
print("--text extraction done---")
# data collection
print("saving collected data in to a file")
dataFrame = pd.DataFrame(text_set, columns=['text'])
dataFrame.to_csv('TextNotProcessedv2.csv')
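# The eleven read-and-append blocks above all follow the same pattern. A small
# helper could collapse them; the sketch below is illustrative only (the
# `collect_texts` name and the two-column layout are assumptions based on the
# calls above), not part of this script:
#
#     def collect_texts(path, text_column='text', id_column='id'):
#         frame = pd.read_csv(path, names=[id_column, text_column], header=1)
#         return [str(t) for t in frame[text_column]]
#
#     text_set.extend(collect_texts('venv/Data/newUpdate.csv'))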
#---//collect dataSet---
print("--Defining Stop Words---")
stop_words = m.read_stopwords()  # stop-word list provided by the project's Methods module
print("data cleaning......")
lem = WordNetLemmatizer()
ps = PorterStemmer()  # created for parity with the original script; stemming is not applied below
for item in text_set:
    temporary_array = []
    item = str(item)
    item = re.sub(r"http\S+", "", item)       # strip URLs
    item = re.sub('[^a-zA-Z]', ' ', item)     # keep letters only
    item = re.sub("</?.*?>", " <> ", item)    # strip HTML-style tags
    item = re.sub(r"(\d|\W)+", " ", item)     # collapse digits and non-word characters
    item = item.lower()
    item = item.split()
    item = [lem.lemmatize(word) for word in item if word not in stop_words]
    item = " ".join(item)
    tokenItem = word_tokenize(item)
    for word in tokenItem:
        nword = spell.correction(word)
        if nword and nword != word:  # correction() can return None when no candidate is found
            word = nword
        temporary_array.append(word)
    corpus.append(TreebankWordDetokenizer().detokenize(temporary_array))
print("corpus removing duplicates")
print("first", len(corpus))
corpus = list(dict.fromkeys(corpus))
print("corpus ready")
print('Start writing into a csv file')
df = pd.DataFrame(corpus, columns=['text'])
df.to_csv('FinalCorpus.csv')
print('Everything is ready')
f = open("FinalCorpus.txt","w+")
for i in range(len(corpus)):
f.write("%s\n" % corpus[i])
f.close()