forked from knowitall/yelp-dataset-challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common_extraction_generator.py
45 lines (37 loc) · 1.36 KB
/
common_extraction_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json
from collections import Counter
from util import *
# Maximum number of distinct attributes kept, ranked by their total count.
NUM_ATTRS = 1500
# Maximum number of distinct values kept, ranked by their total count.
NUM_VALUES = 3000
def generate_common_extractions():
    """Keep only the most common attributes and values of the raw extractions.

    Loads ./data/extractions.json, which is indexed as
    extractions[place][attr][value] and whose leaves are summed as numeric
    counts. Totals are accumulated per attribute and per value across all
    places; the NUM_ATTRS attributes and NUM_VALUES values with the highest
    totals are selected. An entry is copied to the output only when both its
    attribute and its value were selected. The result is written to
    ./data/common_extractions.json.

    Takes no arguments and returns None; the output file is the result.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("Loading extractions...")
    extractions = load_json('./data/extractions.json')

    print('Generating counts...')
    attr_counts = Counter()
    value_counts = Counter()
    for place in extractions:
        for attr in extractions[place]:
            counts_by_value = extractions[place][attr]
            attr_total = 0
            for value in counts_by_value:
                count = counts_by_value[value]
                # Credit each value with its own count only. Adding the
                # running attribute total here would inflate later values
                # and make the ranking depend on iteration order.
                value_counts[value] += count
                attr_total += count
            attr_counts[attr] += attr_total

    print("Writing common attributes and values...")
    # most_common(n) yields the n highest-count (key, count) pairs; only the
    # keys are needed for the membership tests below.
    attrs = {attr for attr, _ in attr_counts.most_common(NUM_ATTRS)}
    values = {value for value, _ in value_counts.most_common(NUM_VALUES)}

    common_extractions = NestedDict()
    for place in extractions:
        for attr in extractions[place]:
            if attr not in attrs:
                continue
            counts_by_value = extractions[place][attr]
            for value in counts_by_value:
                if value in values:
                    common_extractions[place][attr][value] = counts_by_value[value]
    write_json(common_extractions, './data/common_extractions.json')
# Run the generator only when this file is executed as a script, not on import.
if "__main__" == __name__:
    generate_common_extractions()