-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgather_traces.py
executable file
·79 lines (73 loc) · 2.61 KB
/
gather_traces.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import glob
import json
import logging
import os
import pathlib
import sqlite3
import subprocess
import sys
# Default for the --num-repos command-line option; main() stops once this
# many traces have been imported successfully.
DEFAULT_NUM_REPOS = 10
# Scratch directory; main() creates it (non-recursively) on startup.
TEMP_DIR = pathlib.Path('./data/temp')
# Configure the root logger at INFO so the progress messages emitted through
# `logger` below are not filtered out.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main(args, conn):
    """Download notebook history databases and merge them into ``cell_execs``.

    Reads the list of candidate traces from ``./data/traces.json``, skips every
    trace id that is already present in the ``cell_execs`` table or recorded in
    ``./data/seen-traces.json``, and for each remaining entry downloads the raw
    SQLite file with ``wget``, attaches it as ``t`` and copies the rows of
    ``t.history`` into ``cell_execs``.  The set of attempted trace ids is
    written back to ``./data/seen-traces.json`` before returning, so failed
    downloads are not retried on the next run.

    Args:
        args: Object with a ``num_repos`` attribute.  A positive value stops
            the loop after that many traces were imported successfully; zero
            or a negative value means no limit.
        conn: Open ``sqlite3`` connection to the database that holds the
            ``cell_execs`` table.  Successful imports are committed on this
            connection; a failed import rolls back its uncommitted work.

    Returns:
        None.
    """
    TEMP_DIR.mkdir(exist_ok=True)
    with open('./data/traces.json') as f:
        trace_json = json.load(f)

    # Trace ids that are already stored in the database.
    cursor = conn.cursor()
    try:
        seen_traces = {row[0] for row in cursor.execute('SELECT trace FROM cell_execs')}
    finally:
        cursor.close()

    # Trace ids attempted by earlier runs (including ones that failed).
    if os.path.exists('./data/seen-traces.json'):
        with open('./data/seen-traces.json') as f:
            seen_traces |= set(map(int, json.load(f)['seen']))

    successes = 0
    for entry in trace_json:
        trace_id = int(entry['id'])
        if trace_id in seen_traces:
            logger.info('Skipping already download nb trace %d', trace_id)
            continue
        # Mark the trace as seen before trying, so a failure is not retried.
        seen_traces.add(trace_id)
        logger.info("Working on entry %s", entry['html_url'])

        # Per-iteration state: only detach / close what this iteration
        # actually attached / opened.
        cursor = None
        attached = False
        try:
            subprocess.check_output(
                ['wget', '-q', '-O', './data/temp/temp.sqlite', f'{entry["html_url"]}?raw=true']
            )
            cursor = conn.cursor()
            cursor.execute('attach "./data/temp/temp.sqlite" as t')
            attached = True
            cursor.execute(
                'INSERT INTO cell_execs '
                'SELECT ?, session, line AS counter, source '
                'FROM t.history',
                (trace_id,),
            )
            conn.commit()
        except KeyboardInterrupt:
            # Stop processing but still persist the seen-traces file below.
            break
        except Exception as e:
            logger.info("Exception while grabbing nb history for repo: %s", e)
            # A transaction left open by a failed INSERT can keep the attached
            # database locked, which would make the DETACH below fail and
            # leave `t` attached for every following iteration.
            conn.rollback()
            continue
        finally:
            if cursor is not None:
                try:
                    if attached:
                        cursor.execute('detach t')
                except Exception as e:
                    logger.info("Exception while detaching from temp.sqlite: %s", e)
                finally:
                    cursor.close()
            # Always clear the scratch directory, whether or not the import
            # succeeded.
            subprocess.check_call(['rm', '-f'] + glob.glob('./data/temp/*'))

        successes += 1
        if 0 < args.num_repos <= successes:
            break

    seen_traces = {'seen': sorted(seen_traces)}
    with open('./data/seen-traces.json', 'w') as f:
        f.write(json.dumps(seen_traces, indent=2))
if __name__ == '__main__':
    # Script entry point: parse the command line, open the trace database and
    # run main(); the database is committed and closed and the scratch files
    # are removed no matter how main() finishes.
    cli = argparse.ArgumentParser(description='Grab notebook traces from github')
    cli.add_argument('--num-repos', type=int, default=DEFAULT_NUM_REPOS)
    args = cli.parse_args()

    conn = sqlite3.connect('./data/traces.sqlite')
    try:
        exit_status = main(args, conn)
        sys.exit(exit_status)
    finally:
        conn.commit()
        conn.close()
        leftovers = glob.glob('./data/temp/*')
        subprocess.check_call(['rm', '-f', *leftovers])