-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData_Cleaning_functions.py
57 lines (43 loc) · 2.36 KB
/
Data_Cleaning_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re
import pandas as pd
import numpy as np
#Converting labels to numbers
def label_to_int(label):
    """Map a stance label string to its integer class id.

    'support' -> 0, 'deny' -> 1, 'query' -> 2, 'comment' -> 3.
    Any other value yields None.
    """
    # The position of each name in this tuple is its class id.
    ordered_labels = ('support', 'deny', 'query', 'comment')
    for class_id, name in enumerate(ordered_labels):
        if label == name:
            return class_id
    return None
# Pre-processing of Twitter and Reddit posts to mask URLs and mentions.
# URLs are replaced with $URL$ and mentions with $MENTION$.
# The patterns are compiled once here because processText is applied per row.
_URL_PATTERN = re.compile(
    r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
)
# Underscores are included so that handles such as @user_name are masked in
# full instead of leaving a "_name" remainder behind.
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def processText(text):
    """Return *text* stripped, with URLs and @mentions replaced by placeholders.

    Leading and trailing whitespace is removed first. Every http/https/ftp
    URL is then replaced by the literal ``$URL$``, and afterwards every
    ``@`` followed by one or more ASCII letters, digits or underscores is
    replaced by the literal ``$MENTION$``. URLs are handled first so that an
    ``@`` inside a URL path is consumed as part of the URL.

    Note that any ``@`` directly followed by handle characters is masked,
    which includes the part after ``@`` in an e-mail address.

    text: the post text; must be a ``str`` (it is stripped via ``str.strip``).
    """
    masked = _URL_PATTERN.sub("$URL$", text.strip())
    return _MENTION_PATTERN.sub("$MENTION$", masked)
'''Processing all of Twitter and Reddit data frames to
1. Get rid of all NaN values
2. Remove columns not useful for the Model
3. Process text
4. Return a combined frame consisting of both Twitter and Reddit data'''
def processStanceData(twitterDf, RedditDf):
    """Combine the Twitter and Reddit frames into one cleaned frame.

    The two frames are concatenated row-wise, NaN cells are replaced with
    empty strings, ``label_x`` is converted to a number with
    ``label_to_int``, and the text columns are masked with ``processText``.

    Both inputs are expected to provide the columns ``text_x``, ``id``,
    ``inre_x``, ``source_x``, ``label_x``, ``inreText`` and ``sourceText``.

    Columns of the returned frame:
        replyText - the reply post (whose stance towards the target needs
            to be learnt); taken from ``text_x``
        replyTextId - the ID of the reply post; taken from ``id``
        previousText - the text to which replyText was replied; taken from
            ``inre_x``
        sourceText - the source post of the conversation thread; taken from
            ``source_x``
        label - the label value assigned to each post; taken from ``label_x``
        previousPlusSrcText - the input ``inreText`` followed by the input
            ``sourceText``, separated by a single space
        labelValue - the numeric value assigned to each label
        pReplyText - ``replyText`` after ``processText``
        pPreviousPlusSrcText - ``previousPlusSrcText`` after ``processText``
        TextSrcInre - ``pReplyText`` followed by ``pPreviousPlusSrcText``,
            separated by a single space
    """
    # Stack both sources row-wise, then blank out every NaN cell.
    merged = pd.concat([twitterDf, RedditDf])
    cleaned = merged.replace(np.nan, '', regex=True)

    # Derived columns: numeric label and the "in reply to" + source context.
    cleaned['labelvalue'] = cleaned.label_x.apply(label_to_int)
    cleaned['SrcInre'] = cleaned['inreText'].str.cat(cleaned['sourceText'], sep=" ")

    # (column kept from the merged frame, name it gets in the result)
    column_map = (
        ('text_x', 'replyText'),
        ('id', 'replyTextId'),
        ('inre_x', 'previousText'),
        ('source_x', 'sourceText'),
        ('label_x', 'label'),
        ('SrcInre', 'previousPlusSrcText'),
        ('labelvalue', 'labelValue'),
    )
    data = cleaned[[old_name for old_name, _ in column_map]].copy()
    data.columns = [new_name for _, new_name in column_map]

    # Mask URLs and mentions in the reply and in its context.
    for raw_column, masked_column in (
        ('replyText', 'pReplyText'),
        ('previousPlusSrcText', 'pPreviousPlusSrcText'),
    ):
        data[masked_column] = data[raw_column].apply(processText)

    data['TextSrcInre'] = data['pReplyText'].str.cat(data['pPreviousPlusSrcText'], sep=" ")
    return data