#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# import sys
# the mock-0.3.1 dir contains testcase.py, testutils.py & mock.py
# sys.path.append('lib/')
from bottle import post, get, run, static_file, template, redirect, request, response
from py2neo import Graph
from bs4 import BeautifulSoup, NavigableString
from random import randint
from json import dumps
from himymutil.soupselect import select
from himymutil.sentences import all_sentences
from himymutil.ml import pos_features
import json
import csv
import nltk
import pickle
import itertools

graph = Graph()
all_sentences = all_sentences()

def extract_speaker(sentence):
    # Classify each token; the leading run of tokens flagged True is taken as the speaker.
    tokenized_sentence = nltk.word_tokenize(sentence)
    sentence_pos = nltk.pos_tag(tokenized_sentence)
    word_pos = [(word, classifier.classify(pos_features(tokenized_sentence, sentence_pos, i)))
                for i, word in enumerate(tokenized_sentence)]
    speaker = itertools.takewhile(lambda x: x[1] == True, word_pos)
    return "".join(s[0] for s in speaker)

with open("classifiers/decision_tree.pickle") as f:
    classifier = pickle.load(f)

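# Illustrative only: assuming the pickled decision tree flags a sentence's
# leading speaker tokens as True, extract_speaker("Barney: Suit up!") would
# return something like "Barney:".
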
@get('/css/<filename:re:.*\.css>')
def get_css(filename):
    return static_file(filename, root="static", mimetype="text/css")

@get('/images/<filename:re:.*\.png>')
def get_image(filename):
    return static_file(filename, root="static", mimetype="image/png")

@get('/js/<filename:re:.*\.js>')
def get_js(filename):
    return static_file(filename, root="static", mimetype="application/javascript")

@get("/")
def get_index():
    redirect("/episodes")

@get("/episodes")
def get_episodes():
    """ List of all episodes.
    """
    statement = """\
    MATCH (e:Episode)
    RETURN e.id AS id, e.title AS title, e.number AS number, e.season AS season
    ORDER BY id
    """
    return template("episodes", episodes=graph.cypher.execute(statement))

@get("/episodes/<episode_id>")
def get_episode(episode_id):
    """ Show specific episode
    """
    statement = """\
    MATCH (e:Episode {id: {episodeId}})-[r:TOPIC]->(topic)
    WITH e, r, topic ORDER BY r.score DESC
    RETURN e.id AS id, e.title AS title, e.season AS season, e.number AS number,
           COLLECT({id: topic.id, name: topic.value, score: r.score}) AS topics
    ORDER BY id
    """
    episode = graph.cypher.execute(statement, {"episodeId": int(episode_id)})[0]
    season = episode["season"]
    number = episode["number"]

    sentences = []
    with open("data/import/sentences.csv", "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=",")
        reader.next()  # skip the CSV header row
        for row in reader:
            if int(row[1]) == int(episode["id"]):
                # Classify each token; the leading run flagged True is treated as the speaker.
                tokenized_sentence = nltk.word_tokenize(row[4].decode("utf-8"))
                sentence_pos = nltk.pos_tag(tokenized_sentence)
                word_pos = [(word, classifier.classify(pos_features(tokenized_sentence, sentence_pos, i)))
                            for i, word in enumerate(tokenized_sentence)]
                speaker = list(itertools.takewhile(lambda x: x[1] == True, word_pos))
                sentences.append(("".join(s[0] for s in speaker), row[4]))

    transcript = open("data/transcripts/S%d-Ep%d" % (season, number)).read()
    soup = BeautifulSoup(transcript)
    rows = select(soup, "table.tablebg tr td.post-body div.postbody")

    return template("episode", episode=episode, transcript=rows[0], sentences=sentences)

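# Hypothetical request (assumes episodes were imported with integer ids):
#   GET http://localhost:8000/episodes/1
# renders the "episode" template with the episode's topics, the (speaker,
# sentence) pairs built above and the raw transcript markup.
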
@get("/topics/<topic_id>")
def get_topic(topic_id):
    """ Show specific topic
    """
    statement = """\
    MATCH (topic:Topic {id: {topicId}})<-[r:TOPIC]-(e)
    WITH e, r, topic ORDER BY r.score DESC
    RETURN topic.id AS id, topic.value AS value,
           COLLECT({id: e.id, title: e.title, score: r.score}) AS episodes
    ORDER BY id
    """
    topic = graph.cypher.execute(statement, {"topicId": int(topic_id)})[0]
    return template("topic", topic=topic)

@get("/training")
def get_sentences_to_train():
    print request.headers['Accept']
    if "application/json" in request.headers['Accept']:
        sentences = {"sentences": []}
        for i in range(0, 10):
            # randint is inclusive at both ends, so cap the index at len - 1
            sentences["sentences"].append(nltk.word_tokenize(all_sentences[randint(0, len(all_sentences) - 1)]))
        response.content_type = "application/json"
        return sentences
    else:
        return template("training")

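# Example request for the JSON variant (the app listens on localhost:8000, see
# the run() call at the bottom of the file):
#   curl -H "Accept: application/json" http://localhost:8000/training
# Any other Accept header gets the "training" template instead.
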
@post("/training")
def train_sentence():
    with open("data/import/trained_sentences.json", "r") as json_file:
        json_data = json.load(json_file)

    for sentence in json.load(request.body):
        json_data.append(sentence)

    with open("data/import/trained_sentences.json", "w") as json_file:
        json.dump(json_data, json_file)

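# The POST body is expected to be a JSON array; each element is appended as-is
# to data/import/trained_sentences.json. A hypothetical call:
#   curl -X POST -H "Content-Type: application/json" \
#        -d '[...]' http://localhost:8000/training
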
if __name__ == "__main__":
    run(host="localhost", port=8000, reloader=True, debug=True)
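
# Run with `python app.py`; the site is then served at http://localhost:8000/
# and the index route redirects to /episodes.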