-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathdata_scraping_stocktwits.py
96 lines (79 loc) · 2.61 KB
/
data_scraping_stocktwits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import argparse
import os
import requests
import json
import datetime
import pandas as pd
# Command-line interface of the script; parsed inside main().
parser = argparse.ArgumentParser()
# Parsed as an integer so that it can be compared numerically (the first
# scraping run is iteration 1). Only used when --concat_df is 'N'.
parser.add_argument(
    '--scrape_iter',
    required=False,
    type=int,
    help='Scrape Iteration',
)
# 'Y' concatenates the previously extracted parts, 'N' extracts one more part.
parser.add_argument(
    '--concat_df',
    required=True,
    choices=['Y', 'N'],
    help='Use this script to concat all of the extracted datasets or to extract StockTwits data part by part',
)
# Root folder under which the 'Dataset' directory is read and written.
parser.add_argument(
    '--path',
    required=True,
    help="Path to SENN folder",
)
def stocktwits_scrap(n,ticker,base=149407261):
    '''
    Scrape StockTwits messages for one ticker by walking windows of message IDs.

    Up to ``n`` requests are sent. Each request asks for messages of ``ticker``
    whose ID lies between the current cursor and cursor + 5000 (at most 30 per
    request); after a successful request the cursor moves forward by 5001.
    Scraping stops early at the first request that fails or whose response
    does not have the expected structure.

    Parameters:
        n: Maximum number of requests to send.
        ticker: Symbol to scrape, e.g. 'BA'.
        base: Base ID to scrape from.

    Returns:
        A tuple ``(df_stocktwits, last_id)``.
        * When messages were collected: a DataFrame with the columns 'id',
          'created_at' (parsed to datetimes) and 'body', sorted by
          'created_at', and the ID found in its last row.
        * When nothing was collected: an empty DataFrame with the same columns
          and the ID cursor that was reached.
        * When the collected data cannot be turned into a DataFrame: ``None``
          and the ID cursor that was reached.
    '''
    columns = ['id', 'created_at', 'body']
    ids = []
    created = []
    bodies = []
    cursor = base

    for _ in range(n):
        url = ("https://api.stocktwits.com/api/2/streams/symbol/" + ticker
               + ".json?max=" + str(cursor + 5000)
               + "&since=" + str(cursor) + "&limit=30")
        try:
            # The timeout keeps a stalled connection from blocking forever.
            response = requests.request("GET", url, headers={}, data={}, timeout=30)
            messages = response.json()['messages']
            # Extract the whole batch first so that a malformed message cannot
            # leave the three lists with different lengths.
            batch = [(msg['id'], msg['created_at'], msg['body']) for msg in messages]
        except (requests.RequestException, ValueError, KeyError, TypeError):
            print('Warning Messages: \n ID Out of Range or Too Many Requests within 1 Hour')
            break

        # Only advance once the window was actually fetched, so that a failed
        # window is not silently skipped.
        cursor += 5001

        for message_id, message_created_at, message_body in batch:
            ids.append(message_id)
            created.append(message_created_at)
            bodies.append(message_body)
            if len(ids) % 100 == 0:
                print('Done Scrape ',len(ids),' messages.')

    if not ids:
        # Nothing to sort and no "last row" to read the ID from.
        print('Done Scrape ',0,' messages.')
        return pd.DataFrame(columns=columns), cursor

    try:
        df_stocktwits = pd.DataFrame(ids, columns=['id'])
        df_stocktwits['created_at'] = pd.to_datetime(created)
        df_stocktwits['body'] = bodies
        df_stocktwits = df_stocktwits.sort_values(by='created_at')
        df_stocktwits = df_stocktwits.reset_index(drop=True)
    except (ValueError, TypeError):
        # E.g. a timestamp that cannot be parsed.
        print('Error')
        return None, cursor

    last_id = df_stocktwits['id'].iloc[-1]
    print('Done Scrape ',len(ids),' messages.')
    print('Last ID: ',last_id)
    return df_stocktwits, last_id
def main():
    '''
    Entry point: either scrape one more part of StockTwits data for 'BA' or
    concatenate all previously scraped parts into a single CSV file.

    With ``--concat_df Y`` every CSV file in
    ``<path>/Dataset/stocktwits BA scraping`` is read (in file-name order),
    concatenated, and written to ``<path>/Dataset/df_stocktwits_full_BA.csv``.

    With ``--concat_df N`` one scraping run of up to 1000 requests is performed
    and written to
    ``<path>/Dataset/stocktwits BA scraping/df_stocktwits_<scrape_iter>.csv``.
    Iteration 1 starts from the fixed base ID 149407261; later iterations
    resume from the last ID stored in the previous iteration's file.

    Invalid or missing inputs terminate the program through ``parser.error``.
    '''
    args = parser.parse_args()
    dataset_dir = os.path.join(args.path, 'Dataset')
    scrape_dir = os.path.join(dataset_dir, 'stocktwits BA scraping')

    # --concat_df is restricted to 'Y'/'N'; both are truthy strings, so the
    # value has to be compared explicitly.
    if args.concat_df == 'Y':
        _, _, found = next(os.walk(scrape_dir), (None, None, []))
        filenames = sorted(name for name in found if name.lower().endswith('.csv'))
        if not filenames:
            parser.error('no CSV files to concatenate in ' + scrape_dir)
        parts = [pd.read_csv(os.path.join(scrape_dir, name)) for name in filenames]
        df_stocktwits = pd.concat(parts)
        df_stocktwits.to_csv(os.path.join(dataset_dir, 'df_stocktwits_full_BA.csv'), index=False)
        return

    if args.scrape_iter is None:
        parser.error('--scrape_iter is required when --concat_df is N')
    try:
        # int() also covers the case where the argument arrives as a string.
        scrape_iter = int(args.scrape_iter)
    except ValueError:
        parser.error('--scrape_iter must be an integer')

    if scrape_iter == 1:
        base = 149407261
    else:
        # NOTE(review): the previous code read an undefined `last_id` here.
        # Resuming from the last ID of the previous part file is the assumed
        # intent -- confirm.
        previous_file = os.path.join(
            scrape_dir, 'df_stocktwits_' + str(scrape_iter - 1) + '.csv')
        try:
            previous_ids = pd.read_csv(previous_file, usecols=['id'])['id']
        except (OSError, ValueError):
            parser.error('cannot read the previous scrape result: ' + previous_file)
        if previous_ids.empty:
            parser.error('the previous scrape result contains no IDs: ' + previous_file)
        base = int(previous_ids.iloc[-1])

    df_stocktwits, _ = stocktwits_scrap(n=1000, ticker='BA', base=base)
    if df_stocktwits is None:
        raise SystemExit('Scraping failed; no file was written.')

    os.makedirs(scrape_dir, exist_ok=True)
    df_stocktwits.to_csv(
        os.path.join(scrape_dir, 'df_stocktwits_' + str(scrape_iter) + '.csv'),
        index=False)
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()