Merge pull request #20 from alistairewj/master

Fix errors and reformat ptb2ud.py
ncbi-nlp · Feb 20, 2019 · 7a2be81 · 7a2be81
2 parents da4762f + 0f7f293
commit 7a2be81
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 17 deletions.
diff --git a/negbio/pipeline/parse.py b/negbio/pipeline/parse.py
@@ -30,11 +30,12 @@ def parse(self, s):
         """
         if not s:
             raise ValueError('Cannot parse empty sentence: {}'.format(s))
-        try:
-            nbest = self.rrp.parse(str(s))
+
+        nbest = self.rrp.parse(str(s))
+        if nbest:
             return nbest[0].ptb_parse
-        except:
-            raise ValueError('Cannot parse sentence: %s' % s)
+
+        return None
 
 
 class NegBioParser(Bllip):
@@ -50,10 +51,12 @@ def parse_doc(self, document):
         """
         for passage in document.passages:
             for sentence in passage.sentences:
-                try:
-                    text = sentence.text
-                    tree = self.parse(text)
+                text = sentence.text
+                tree = self.parse(text)
+                if tree:
                     sentence.infons['parse tree'] = str(tree)
-                except:
-                    logging.exception('Cannot parse sentence: {}'.format(sentence.offset))
+                else:
+                    sentence.infons['parse tree'] = None
+                    logging.exception(
+                        'No parse tree for sentence: %s', sentence.offset)
         return document
diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py
@@ -51,7 +51,7 @@ class Ptb2DepConverter(object):
 
     basic = 'basic'
     collapsed = 'collapsed'
-    CCprocessed = 'CCprocessed',
+    CCprocessed = 'CCprocessed'
     collapsedTree = 'collapsedTree'
 
     def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
@@ -98,19 +98,32 @@ def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
         Args:
             lemmatizer (Lemmatizer)
         """
-        super(NegBioPtb2DepConverter, self).__init__(lemmatizer, representation, universal)
+        super(NegBioPtb2DepConverter, self).__init__(
+            lemmatizer, representation, universal)
 
     def convert_doc(self, document):
         for passage in document.passages:
             for sentence in passage.sentences:
+                # skip sentences that have no infons or no parse tree;
+                # this sometimes happens with poorly tokenized sentences
+                if not sentence.infons:
+                    continue
+                elif not sentence.infons['parse tree']:
+                    continue
+
                 try:
-                    dependency_graph = self.convert(sentence.infons['parse tree'])
-                    anns, rels = convert_dg(dependency_graph, sentence.text, sentence.offset,
-                                            self.add_lemmas)
+                    dependency_graph = self.convert(
+                        sentence.infons['parse tree'])
+                    anns, rels = convert_dg(dependency_graph, sentence.text,
+                                            sentence.offset,
+                                            has_lemmas=self.add_lemmas)
                     sentence.annotations = anns
                     sentence.relations = rels
+                except KeyboardInterrupt:
+                    raise
                 except:
-                    logging.exception("Cannot process sentence %d in %s", sentence.offset, document.id)
+                    logging.exception(
+                        "Cannot process sentence %d in %s", sentence.offset, document.id)
 
                 if not self.add_lemmas:
                     for ann in sentence.annotations:
@@ -188,8 +201,10 @@ def convert_dg(dependency_graph, text, offset, ann_index=0, rel_index=0, has_lem
         relation.infons['dependency'] = node.deprel
         if node.extra:
             relation.infons['extra'] = node.extra
-        relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.index]), 'dependant'))
-        relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.head]), 'governor'))
+        relation.add_node(bioc.BioCNode('T{}'.format(
+            annotation_id_map[node.index]), 'dependant'))
+        relation.add_node(bioc.BioCNode('T{}'.format(
+            annotation_id_map[node.head]), 'governor'))
         relations.append(relation)
         rel_index += 1