Skip to content

Commit

Permalink
- testing more features
Browse files · Browse the repository at this point in the history
- adding character position for each term at the sentence level (this is useful for issue #10)
Loading branch information…
diegoesteves committed Apr 23, 2017
1 parent 3820cce commit 96b4950
Show file tree
Hide file tree
Showing 8 changed files with 155,606 additions and 62,360 deletions.
46,616 changes: 46,616 additions & 0 deletions output/experiments/EXP_000/out_exp000_0_NLTK.csv

Large diffs are not rendered by default.

46,616 changes: 46,616 additions & 0 deletions output/experiments/EXP_000/out_exp000_1_NLTK.csv

Large diffs are not rendered by default.

124,686 changes: 62,343 additions & 62,343 deletions output/features_dt_wnut16.csv

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions scripts/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
#a = [l[0].decode('utf8')]
#print a[0]

#text = u"diego esteves lives in sao paulo".encode('utf8')
text3 = 'Driving , driving , driving away to Phil . Tasty dinner tonight with the Society of Mining and Metallurgy Engineers .'
text = u"paris hilton was once the toast of the town".encode('utf8')
text2 = u"bullshit about airports/coffee/conferences".encode('utf8')
text = "diego's estees-III @sajdh yo yo go!brow. ha!"
#text = 'Driving , driving , driving away to Phil . Tasty dinner tonight with the Society of Mining and Metallurgy Engineers .'
#text = "paris hilton was once the toast of the town"
#text = u"bullshit about airports/coffee/conferences".encode('utf8')
horus = Core(False, 5)
horus.annotate(text3)
horus.annotate(text)
#print horus.get_cv_annotation()


18 changes: 18 additions & 0 deletions src/horus/components/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def convert_dataset_to_horus_matrix(self, sentences):
try:
for sent in sentences:
sent_index+=1
ipositionstartterm = 0
for c in range(len(sent[6][self.config.models_pos_tag_lib])):
word_index_ref = sent[6][self.config.models_pos_tag_lib][c][0]
compound = sent[6][self.config.models_pos_tag_lib][c][1]
Expand Down Expand Up @@ -654,7 +655,24 @@ def convert_dataset_to_horus_matrix(self, sentences):
temp = [is_entity, sent_index, word_index, term, tag_pos_uni, tag_pos, tag_ner, 0, 0] # 0-8
temp.extend(self.populate_matrix_new_columns())
temp.extend([tag_ner_y])
# that is a hack to integrate to GERBIL

if ipositionstartterm >= len(sent[1][0]):
ipositionstartterm-=1
if sent[1][0][ipositionstartterm] == term[0]:
if sent[1][0][ipositionstartterm:ipositionstartterm+len(term)] != term:
raise Exception("GERBIL integration: error 1!")
else:
ipositionstartterm-=1
if sent[1][0][ipositionstartterm] == term[0]:
if sent[1][0][ipositionstartterm:ipositionstartterm+len(term)] != term:
raise Exception("GERBIL integration: error 2!")
else:
raise Exception("GERBIL integration: error 3!")

temp[27] = ipositionstartterm
converted.append(temp)
ipositionstartterm += (len(term) + 1)

except Exception as error:
self.sys.log.error(':: Erro! %s' % str(error))
Expand Down
14 changes: 5 additions & 9 deletions src/horus/experiments/horus_dt_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
special_char = 1 if len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token))>0 else 0
first_capitalized = 1 if token[0].isupper() else 0
capitalized = 1 if token.isupper() else 0
title = 1 if token.istitle() else 0
digit = 1 if token.isdigit() else 0
nr_images_returned = linha[17]
nr_websites_returned = linha[25]
hyphen = 1 if '-' in token else 0
Expand All @@ -80,17 +82,11 @@
elif linha[6] in definitions.NER_TAGS_PER: ner = definitions.KLASSES2["PER"]
else: ner = definitions.KLASSES2["O"]


'''
pos-1; pos; pos+1; cv_loc; cv_org; cv_per; cv_dist; cv_plc;
tx_loc; tx_org; tx_per; tx_err; tx_dist;
one_char; special_char; first_cap; cap
'''
features.append((pos_bef, pos, pos_aft,
features.append((pos_bef, pos, pos_aft, title, digit,
one_char_token, special_char, first_capitalized, hyphen,
capitalized, nr_images_returned,
cv_org, cv_loc, cv_per, cv_dist, cv_plc))
#tx_org, tx_loc, tx_per, tx_dist, tx_err))
cv_org, cv_loc, cv_per, cv_dist, cv_plc,
tx_org, tx_loc, tx_per, tx_dist, tx_err))
print len(Y)
print set(Y)
print set(teste)
Expand Down
4 changes: 2 additions & 2 deletions src/horus/experiments/horus_dt_layer_bkp.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from horus.components.config import HorusConfig

config = HorusConfig()
#file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), delimiter=",")
#file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), delimiter=",")
#header1 = file1reader.next()

features = []
Expand All @@ -27,7 +27,7 @@
'PL_CV_I', 'CV_KLASS', 'TOT_RESULTS_TX', 'TOT_TX_LOC', 'TOT_TX_ORG', 'TOT_TX_PER',
'TOT_ERR_TRANS', 'DIST_TX_I', 'TX_KLASS', 'HORUS_KLASS']

data = pandas.read_csv((config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), sep=',',
data = pandas.read_csv((config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), sep=',',
names=colnames, na_values=['*'], header=0,
dtype={"IS_ENTITY?": int, "ID_SENT": int, "ID_WORD": int, "WORD_TERM": str,
"POS_UNI": str, "POS": str, "NER": str, "COMPOUND": int, "COMPOUND_SIZE": int,
Expand Down
2 changes: 1 addition & 1 deletion src/horus/experiments/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from horus.components.config import HorusConfig

config = HorusConfig()
file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1.csv"), delimiter=",")
file1reader = csv.reader(open(config.output_path + "experiments/ritter/EXP_000/out_exp000_1_NLTK.csv"), delimiter=",")
header1 = file1reader.next() #header

tot = 0
Expand Down

0 comments on commit 96b4950

Please sign in to comment.