-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.py
105 lines (89 loc) · 3.7 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from itertools import chain
from datetime import datetime
from KINCluster import Pipeline
import numpy as np
from item import myItem as Item
import memento_settings as MS
from connection import put_cluster, put_bulk
from utility import get_similar, get_emotion, filter_quote, Logging
class PipelineServer(Pipeline):
def __init__(self, entity, date_start, date_end, frame, manage_id):
self.entity = entity
self.date_start = date_start
self.date_end = date_end
self.frame = frame
self.manage_id = manage_id
self.clusters = []
def __del__(self):
print (len(self.clusters), 'inserted!')
put_bulk([{
'_index': 'memento',
'_type': 'cluster',
'_source': cluster
} for cluster in self.clusters])
def capture_item(self):
items = []
for _, row in self.frame.iterrows():
items.append(Item(entities=" ".join(row.entities),
title=row.title,
content=row.content,
published_time=row.published_time,
reply_count=row.reply_count,
href_naver=row.href_naver,
cate=row.cate,
imgs=row.imgs,
comments=row.comments,
title_quote=row.title_quote,
content_quote=row.content_quote,
oid=row.oid,
aid=row.aid))
return items
def dress_item(self, ext):
def get_memento_rate(items):
return sum([item.reply_count + 1000 for item in items])
def get_property(item):
return " ".join([item.title, item.content, filter_quote(item.title_quote), filter_quote(item.content_quote)])
def topic_rating(item, value):
rating = value
if self.entity in item.title:
rating += 0.04
for bound in [1000, 500, 250, 100, 50, 25]:
if item.reply_count > bound:
rating += 0.001
else:
break
for bound in [1000, 2500, 5000, 10000]:
if len(item.content) > bound:
rating -= 0.001
else:
break
return rating
if len(ext.items) < MS.MINIMUM_CLUSTER:
return
get_cc = lambda comments: map(lambda x: x['content'], comments)
rate = get_memento_rate(ext.items)
emot = get_emotion("\n".join(map(lambda x: " ".join(get_cc(x.comments)), ext.items)))
simi = get_similar(list(map(get_property, ext.items)), self.entity)
accuracy = np.std(simi) * 100
comb = get_similar(list(map(get_property, ext.items)))
simi -= np.abs(np.mean(comb) - comb)
simi = [topic_rating(item, v) for v, item in zip(simi, ext.items)]
topic = ext.items[np.argmax(simi)]
self.clusters.append({
'entity': self.entity,
'date_start': datetime.strptime(self.date_start, '%Y.%m.%d'),
'date_end': datetime.strptime(self.date_end, '%Y.%m.%d'),
'manage_id': self.manage_id,
'topic': topic.dump(),
'accuracy': accuracy,
'keywords': [{
'keyword': keyword,
'value': value,
} for keyword, value in ext.keywords],
'rate': rate,
'items': ["{}_{}".format(item.oid, item.aid) for item in ext.items],
'emot': [{
'emotion': emo[0],
'value': emo[1],
} for emo in emot]
})