Skip to content

Commit

Permalink
- testing more features
Browse files · Browse the repository at this point in the history
- adding character position for each term at the sentence level (this is useful for issue #10)
Loading branch information…
diegoesteves committed Apr 23, 2017
1 parent 3820cce commit 96b4950
Show file tree
Hide file tree
Showing 8 changed files with 155,606 additions and 62,360 deletions.
46,616 changes: 46,616 additions & 0 deletions output/experiments/EXP_000/out_exp000_0_NLTK.csv

Large diffs are not rendered by default.

46,616 changes: 46,616 additions & 0 deletions output/experiments/EXP_000/out_exp000_1_NLTK.csv

Large diffs are not rendered by default.

124,686 changes: 62,343 additions & 62,343 deletions output/features_dt_wnut16.csv

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions scripts/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
#a = [l[0].decode('utf8')]
#print a[0]

#text = u"diego esteves lives in sao paulo".encode('utf8')
text3 = 'Driving , driving , driving away to Phil . Tasty dinner tonight with the Society of Mining and Metallurgy Engineers .'
text = u"paris hilton was once the toast of the town".encode('utf8')
text2 = u"bullshit about airports/coffee/conferences".encode('utf8')
text = "diego's estees-III @sajdh yo yo go!brow. ha!"
#text = 'Driving , driving , driving away to Phil . Tasty dinner tonight with the Society of Mining and Metallurgy Engineers .'
#text = "paris hilton was once the toast of the town"
#text = u"bullshit about airports/coffee/conferences".encode('utf8')
horus = Core(False, 5)
horus.annotate(text3)
horus.annotate(text)
#print horus.get_cv_annotation()


18 changes: 18 additions & 0 deletions src/horus/components/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def convert_dataset_to_horus_matrix(self, sentences):
try:
for sent in sentences:
sent_index+=1
ipositionstartterm = 0
for c in range(len(sent[6][self.config.models_pos_tag_lib])):
word_index_ref = sent[6][self.config.models_pos_tag_lib][c][0]
compound = sent[6][self.config.models_pos_tag_lib][c][1]
Expand Down Expand Up @@ -654,7 +655,24 @@ def convert_dataset_to_horus_matrix(self, sentences):
temp = [is_entity, sent_index, word_index, term, tag_pos_uni, tag_pos, tag_ner, 0, 0] # 0-8
temp.extend(self.populate_matrix_new_columns())
temp.extend([tag_ner_y])
# that is a hack to integrate to GERBIL

if ipositionstartterm >= len(sent[1][0]):
ipositionstartterm-=1
if sent[1][0][ipositionstartterm] == term[0]:
if sent[1][0][ipositionstartterm:ipositionstartterm+len(term)] != term:
raise Exception("GERBIL integration: error 1!")
else:
ipositionstartterm-=1
if sent[1][0][ipositionstartterm] == term[0]:
if sent[1][0][ipositionstartterm:ipositionstartterm+len(term)] != term:
raise Exception("GERBIL integration: error 2!")
else:
raise Exception("GERBIL integration: error 3!")

temp[27] = ipositionstartterm
converted.append(temp)
ipositionstartterm += (len(term) + 1)

except Exception as error:
self.sys.log.error(':: Erro! %s' % str(error))
Expand Down
14 changes: 5 additions & 9 deletions src/horus/experiments/horus_dt_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
special_char = 1 if len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token))>0 else 0
first_capitalized = 1 if token[0].isupper() else 0
capitalized = 1 if token.isupper() else 0
title = 1 if token.istitle() else 0
digit = 1 if token.isdigit() else 0
nr_images_returned = linha[17]
nr_websites_returned = linha[25]
hyphen = 1 if '-' in token else 0
Expand All @@ -80,17 +82,11 @@
elif linha[6] in definitions.NER_TAGS_PER: ner = definitions.KLASSES2["PER"]
else: ner = definitions.KLASSES2["O"]


'''
pos-1; pos; pos+1; cv_loc; cv_org; cv_per; cv_dist; cv_plc;
tx_loc; tx_org; tx_per; tx_err; tx_dist;
one_char; special_char; first_cap; cap
'''
features.append((pos_bef, pos, pos_aft,
features.append((pos_bef, pos, pos_aft, title, digit,
one_char_token, special_char, first_capitalized, hyphen,
capitalized, nr_images_returned,
cv_org, cv_loc, cv_per, cv_dist, cv_plc))
#tx_org, tx_loc, tx_per, tx_dist, tx_err))
cv_org, cv_loc, cv_per, cv_dist, cv_plc,
tx_org, tx_loc, tx_per, tx_dist, tx_err))
print len(Y)
print set(Y)
print set(teste)
Expand Down
4 changes: 2 additions & 2 deletions src/horus/experiments/horus_dt_layer_bkp.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from horus.components.config import HorusConfig

config = HorusConfig()
#file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), delimiter=",")
#file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), delimiter=",")
#header1 = file1reader.next()

features = []
Expand All @@ -27,7 +27,7 @@
'PL_CV_I', 'CV_KLASS', 'TOT_RESULTS_TX', 'TOT_TX_LOC', 'TOT_TX_ORG', 'TOT_TX_PER',
'TOT_ERR_TRANS', 'DIST_TX_I', 'TX_KLASS', 'HORUS_KLASS']

data = pandas.read_csv((config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), sep=',',
data = pandas.read_csv((config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), sep=',',
names=colnames, na_values=['*'], header=0,
dtype={"IS_ENTITY?": int, "ID_SENT": int, "ID_WORD": int, "WORD_TERM": str,
"POS_UNI": str, "POS": str, "NER": str, "COMPOUND": int, "COMPOUND_SIZE": int,
Expand Down
2 changes: 1 addition & 1 deletion src/horus/experiments/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from horus.components.config import HorusConfig

config = HorusConfig()
file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), delimiter=",")
file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), delimiter=",")
header1 = file1reader.next() #header

tot = 0
Expand Down

0 comments on commit 96b4950

Please sign in to comment.