# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.3'
#       jupytext_version: 1.0.5
#   kernelspec:
#     display_name: altmetrics
#     language: python
#     name: altmetrics
# ---
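
# Collect Paperbuzz altmetrics events for a set of DOIs: read the DOI list
# from out.csv, query https://api.paperbuzz.org/v0/doi/ for each one at a
# throttled rate, and append the raw responses to paperbuzz.csv.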
# +
import csv
import json
import os
from datetime import datetime
from itertools import zip_longest

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from requests_futures.sessions import FuturesSession

# Use the notebook progress bar when running under IPython/Jupyter, the
# plain terminal bar otherwise.
try:
    __IPYTHON__
except NameError:
    from tqdm import tqdm
else:
    from tqdm import tqdm_notebook as tqdm
# +
# Paperbuzz API endpoint (queried once per DOI); `email` is sent as a
# parameter with every request.
cr_works = "https://api.paperbuzz.org/v0/doi/"
email = "[email protected]"

# Client-side throttle: at most CALLS requests per ONE_SEC-second window.
ONE_SEC = 1
CALLS = 10
# +
@sleep_and_retry
@limits(calls=CALLS, period=ONE_SEC)
def call_paperbuzz(doi, session):
    """Submit a rate-limited asynchronous Paperbuzz request for one DOI."""
    future = session.get(cr_works + doi, params={'email': email}, timeout=15)
    return future


def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
# -
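
# Resume support: if paperbuzz.csv already exists we append to it and skip
# rewriting the header; the previous results are loaded for inspection.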
results_exist = False
if os.path.isfile('./paperbuzz.csv'):
    results_exist = True
    out = pd.read_csv("paperbuzz.csv", dtype={'status': str})
    out.set_index("id", inplace=True)
    out['timestamp'] = pd.to_datetime(out['timestamp'])
    out['date'] = pd.to_datetime(out['date'], errors='coerce')
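
# out.csv holds one row per article from an earlier collection step; its
# `cr_works` column appears to record the HTTP status of a prior Crossref
# works lookup.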
# +
df = pd.read_csv("out.csv", dtype={'cr_works': str})
df.set_index("id", inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df[~df.date.isnull()]
# -
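
# Keep only DOIs whose earlier lookup returned HTTP 200.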
existing_dois = df[df.cr_works == "200"]
# existing_dois = existing_dois.sample(100)
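
# Smoke test: a single request for a known DOI to confirm the API responds.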
with FuturesSession() as session:
    f = call_paperbuzz("10.1038/nature12373", session)
    x = f.result()
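
# Main collection loop: dispatch requests in batches of `batch_size` DOIs
# and append one CSV row per DOI (status, raw JSON response, timestamp).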
# +
columns = ["doi", "date", "status", "response", "timestamp"]
batch_size = 50

batches = grouper(existing_dois.index, batch_size)
# newline='' is the csv-module recommendation for writer targets.
with open('paperbuzz.csv', 'a+', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    if not results_exist:
        csvwriter.writerow(["id"] + columns)
    for batch in tqdm(batches, total=int(np.ceil(len(existing_dois) / batch_size)), desc="Overall"):
        # grouper pads the last batch with None; drop the padding so we do
        # not request the literal DOI "nan".
        batch = [ix for ix in batch if ix is not None]
        futures = []
        subdf = existing_dois.reindex(batch)
        with FuturesSession(max_workers=8) as session:
            # Dispatch every request in the batch first ...
            for ix, row in tqdm(subdf.iterrows(), total=len(subdf), desc="Requests", leave=False):
                doi = row['doi']
                ts = datetime.now().isoformat()
                future = call_paperbuzz(str(doi), session)
                futures.append((ix, ts, future))
            # ... then collect the responses as they complete.
            for ix, ts, future in tqdm(futures, desc="Responses", leave=False):
                sc = None
                response_json = None
                try:
                    resp = future.result(timeout=5)
                    sc = int(resp.status_code)
                    try:
                        response_json = resp.json()
                    except Exception:
                        response_json = {}
                except Exception as e:
                    # No HTTP response: record the exception class name
                    # (e.g. ConnectTimeout) in the status column instead.
                    sc = e.__class__.__name__
                out_row = existing_dois.loc[ix].copy()
                out_row['status'] = sc
                out_row['response'] = json.dumps(response_json) if response_json else str(response_json)
                out_row['timestamp'] = ts
                csvwriter.writerow([ix] + out_row[columns].tolist())
# -
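
# +
# Sanity check (a sketch, not part of the original pipeline): read the
# collected file back and parse one stored response, to confirm the JSON
# survives the CSV round-trip. Assumes at least one HTTP-200 row exists.
collected = pd.read_csv("paperbuzz.csv", dtype={'status': str}).set_index("id")
ok = collected[collected.status == "200"]
if len(ok):
    events = json.loads(ok['response'].iloc[0])
    print(sorted(events.keys()))
# -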