# matcher.py
import codecs
import copy
import json
from optparse import OptionParser
import re
import sys

DEFAULT_PRODUCTS_FILE = 'data/products.txt'
DEFAULT_LISTINGS_FILE = 'data/listings.txt'
def get_input_files():
    """ Parse command-line arguments to determine which input files to use,
    falling back to the defaults if none are provided. """
    parser = OptionParser()
    parser.add_option("-p", "--products", dest="products_file",
                      help="Products input file", metavar="FILE")
    parser.add_option("-l", "--listings", dest="listings_file",
                      help="Listings input file", metavar="FILE")
    (options, args) = parser.parse_args()
    input_files = {}
    input_files["products"] = options.products_file or DEFAULT_PRODUCTS_FILE
    input_files["listings"] = options.listings_file or DEFAULT_LISTINGS_FILE
    return input_files
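
# Example invocation (illustrative; any existing paths work):
#   python matcher.py --products data/products.txt --listings data/listings.txt
# With no flags, the DEFAULT_* paths above are used.
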
def jsonfile_to_list(fn):
    """ Given a path, return a list of dicts parsed from a text file
    formatted with one JSON object per line. """
    try:
        with open(fn, 'r') as f:
            # skip blank lines so a stray empty line doesn't abort the run
            return [ json.loads(line) for line in f if line.strip() ]
    except IOError:
        print 'Input file {0} does not exist or could not be read.'.format(fn)
        sys.exit(1)
    except ValueError:
        print 'Could not decode a JSON object in {0}.'.format(fn)
        sys.exit(1)
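
# Example input line (illustrative -- only the fields this script reads are shown;
# the real data files may carry additional fields):
#   {"product_name": "Samsung_PL170", "manufacturer": "Samsung", "model": "PL170"}
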
def regexify_hyphens(s):
    """ Replace each space or hyphen with a character class that matches an
    optional space or hyphen, so the result can be embedded in a regex. """
    return re.sub('[ -]', '[ -]?', s)
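
# Example (illustrative): regexify_hyphens("DSC W310") returns "DSC[ -]?W310",
# which matches "DSC W310", "DSC-W310", and "DSCW310" in a listing title.
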
def get_unmatched_listings(listings_list):
    """ Return unmatched listings. Helpful for debugging. """
    unmatched = [listing for listing in listings_list
                 if "matched" not in listing]
    return unmatched
def get_product_tokens(str_list):
    """ Given a list of strings, split each on our separator regex and lowercase
    the pieces.  Then check each token against the "alphanumeric" regexes to split
    model names like e.g. PL170 into the tokens PL and 170. """
    split_re = re.compile("[- _]")
    all_tokens = []
    # split each string into tokens and lowercase them
    for s in str_list:
        tokens = [token.lower() for token in split_re.split(s)]
        all_tokens = all_tokens + tokens
    # set up alphanumeric regexes: digits-then-letters and letters-then-digits
    numfirst_regex = re.compile(r"^(\d+)(\D+)")
    ltrfirst_regex = re.compile(r"^(\D+)(\d+)")
    # split alphanumeric tokens like "pl170" into "pl" and "170"
    extra_tokens = []
    for token in all_tokens:
        match_obj_numfirst = numfirst_regex.match(token)
        if match_obj_numfirst is not None:
            extra_tokens = extra_tokens + list(match_obj_numfirst.groups())
        match_obj_ltrfirst = ltrfirst_regex.match(token)
        if match_obj_ltrfirst is not None:
            extra_tokens = extra_tokens + list(match_obj_ltrfirst.groups())
    return unique(all_tokens + extra_tokens)
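
# Example (illustrative): get_product_tokens(["Samsung_PL170", "PL170", "Samsung"])
# returns ['samsung', 'pl170', 'pl', '170'] -- the model token "pl170" is
# additionally split into "pl" and "170" by the alphanumeric regexes.
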
# from http://www.peterbe.com/plog/uniqifiers-benchmark
def unique(seq, idfun=None):
    """ Return the items of seq with duplicates removed, preserving order. """
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen[marker] = 1
        result.append(item)
    return result
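
# Example (illustrative): unique([3, 1, 3, 2, 1]) returns [3, 1, 2].
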
# Main program flow begins
def main():
    # Determine input files to use
    input_files = get_input_files()
    print "Opening input files:"
    print "Products: {0}".format(input_files["products"])
    print "Listings: {0}".format(input_files["listings"])

    # Get python lists of our data files
    products_list = jsonfile_to_list(input_files["products"])
    listings_list = jsonfile_to_list(input_files["listings"])

    # Copy the original list so the output won't have the extra properties I'm adding to it
    orig_listings_list = copy.deepcopy(listings_list)

    print "Mapping {0} listings to {1} unique products...".format(len(listings_list), len(products_list))

    # Prepare product data
    all_products = {}
    for product in products_list:
        # Create an empty destination list for listings matched to this product
        all_products[product["product_name"]] = []
        # Create regex to match the model
        model_hyphens_fixed = regexify_hyphens(product["model"])
        model_re = r'\b' + model_hyphens_fixed + r'\b'
        product["model_re"] = re.compile(model_re, re.IGNORECASE)
        # Split important product properties into tokens and lowercase the tokens
        product["tokens"] = get_product_tokens([product["product_name"], product["model"], product["manufacturer"]])
        # Save the token count for later use
        product["tokens_length"] = len(product["tokens"])

    # Create a title_lower for every listing
    # so we don't need to call lower() inside the big loop when comparing with tokens
    for listing in listings_list:
        listing["title_lower"] = listing["title"].lower()
    # Matching process starts here
    for i, listing in enumerate(listings_list):
        for product in products_list:
            # Count how many of the product's tokens are present in the listing title
            matching_tokens = 0
            for token in product["tokens"]:
                if token in listing["title_lower"]:
                    matching_tokens += 1
            # If enough tokens match (all but at most two of them)
            if matching_tokens >= product["tokens_length"] - 2:
                # And the product and listing share a manufacturer
                if (product["manufacturer"].lower() in listing["manufacturer"].lower() or
                        listing["manufacturer"].lower() in product["manufacturer"].lower()):
                    # Test the model regex against the title
                    model_re_match = product["model_re"].search(listing["title_lower"])
                    if model_re_match is not None:
                        # If all conditions are satisfied, add the listing to the results
                        # for this product.  Note: appending the original listing (by index),
                        # not the annotated working copy.
                        listing["matched"] = True
                        all_products[product["product_name"]].append(orig_listings_list[i])
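
    # Worked example of the threshold above (illustrative): a product whose tokens
    # are ['samsung', 'pl170', 'pl', '170'] has tokens_length 4, so a listing title
    # must contain at least 4 - 2 = 2 of those tokens -- and also pass the
    # manufacturer and model-regex checks -- before it is recorded as a match.
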
    # using the codecs module to write the results file as utf-8
    with codecs.open('results.txt', 'w+', 'utf-8') as outputfile:
        for product in all_products:
            listings = all_products[product]
            results_dict = {}
            results_dict["product_name"] = product
            results_dict["listings"] = listings
            json_str = json.dumps(results_dict, ensure_ascii=False)
            outputfile.write(json_str + "\n")


if __name__ == '__main__':
    main()
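
# Example line in results.txt (illustrative; the listing objects are copied verbatim
# from the listings input file, so they keep whatever fields that file provides):
#   {"product_name": "Samsung_PL170", "listings": [{"title": "Samsung PL170 Digital Camera", "manufacturer": "Samsung"}]}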