From ca4880589ced8f2c08ad548fec24536c26b802f6 Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 14:31:52 -0500 Subject: [PATCH 1/6] pep8 --- negbio/pipeline/ptb2ud.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index 23118da..e7706ce 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -98,19 +98,22 @@ def __init__(self, lemmatizer, representation='CCprocessed', universal=False): Args: lemmatizer (Lemmatizer) """ - super(NegBioPtb2DepConverter, self).__init__(lemmatizer, representation, universal) + super(NegBioPtb2DepConverter, self).__init__( + lemmatizer, representation, universal) def convert_doc(self, document): for passage in document.passages: for sentence in passage.sentences: try: - dependency_graph = self.convert(sentence.infons['parse tree']) - anns, rels = convert_dg(dependency_graph, sentence.text, sentence.offset, - self.add_lemmas) + dependency_graph = self.convert( + sentence.infons['parse tree']) + anns, rels = convert_dg(dependency_graph, sentence.text, + sentence.offset, self.add_lemmas) sentence.annotations = anns sentence.relations = rels except: - logging.exception("Cannot process sentence %d in %s", sentence.offset, document.id) + logging.exception( + "Cannot process sentence %d in %s", sentence.offset, document.id) if not self.add_lemmas: for ann in sentence.annotations: @@ -188,8 +191,10 @@ def convert_dg(dependency_graph, text, offset, ann_index=0, rel_index=0, has_lem relation.infons['dependency'] = node.deprel if node.extra: relation.infons['extra'] = node.extra - relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.index]), 'dependant')) - relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.head]), 'governor')) + relation.add_node(bioc.BioCNode('T{}'.format( + annotation_id_map[node.index]), 'dependant')) + relation.add_node(bioc.BioCNode('T{}'.format( + annotation_id_map[node.head]), 'governor')) relations.append(relation) rel_index += 1 From 72dff0ed72ed866d062dd01f793f9f02c85ac6c8 Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 14:32:12 -0500 Subject: [PATCH 2/6] remove trailing comma tuple, likely bug --- negbio/pipeline/ptb2ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index e7706ce..44873c3 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -51,7 +51,7 @@ class Ptb2DepConverter(object): basic = 'basic' collapsed = 'collapsed' - CCprocessed = 'CCprocessed', + CCprocessed = 'CCprocessed' collapsedTree = 'collapsedTree' def __init__(self, lemmatizer, representation='CCprocessed', universal=False): From 21ffbb7cfa9e4329b16b7cd9b3583225e1188e7b Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 14:33:39 -0500 Subject: [PATCH 3/6] fix has_lemmas being interpreted as annotation index --- negbio/pipeline/ptb2ud.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index 44873c3..8faedd9 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -108,7 +108,8 @@ def convert_doc(self, document): dependency_graph = self.convert( sentence.infons['parse tree']) anns, rels = convert_dg(dependency_graph, sentence.text, - sentence.offset, self.add_lemmas) + sentence.offset, + has_lemmas=self.add_lemmas) sentence.annotations = anns sentence.relations = rels except: From 47fcca070ff29f3f9f3d5614dc061b8ca053ce6e Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 14:34:36 -0500 Subject: [PATCH 4/6] raise error if keyboard interrupt --- negbio/pipeline/ptb2ud.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index 8faedd9..3e831c6 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -112,6 +112,8 @@ def convert_doc(self, document): has_lemmas=self.add_lemmas) sentence.annotations = anns sentence.relations = rels + except KeyboardInterrupt: + raise except: logging.exception( "Cannot process sentence %d in %s", sentence.offset, document.id) From 419768ea5e3719dd04b4c988ae184752c5ca91ca Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 18:50:14 -0500 Subject: [PATCH 5/6] fix opaque errors if parsing of sentence was empty --- negbio/pipeline/parse.py | 14 +++++++++++--- negbio/pipeline/ptb2ud.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/negbio/pipeline/parse.py b/negbio/pipeline/parse.py index 030aad6..fa53c11 100644 --- a/negbio/pipeline/parse.py +++ b/negbio/pipeline/parse.py @@ -30,9 +30,13 @@ def parse(self, s): """ if not s: raise ValueError('Cannot parse empty sentence: {}'.format(s)) + try: nbest = self.rrp.parse(str(s)) - return nbest[0].ptb_parse + if nbest: + return nbest[0].ptb_parse + else: + return None except: raise ValueError('Cannot parse sentence: %s' % s) @@ -53,7 +57,11 @@ def parse_doc(self, document): try: text = sentence.text tree = self.parse(text) - sentence.infons['parse tree'] = str(tree) + if tree: + sentence.infons['parse tree'] = str(tree) + else: + sentence.infons['parse tree'] = None except: - logging.exception('Cannot parse sentence: {}'.format(sentence.offset)) + logging.exception( + 'Cannot parse sentence: {}'.format(sentence.offset)) return document diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index 3e831c6..ab34f84 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -104,6 +104,17 @@ def __init__(self, lemmatizer, representation='CCprocessed', universal=False): def convert_doc(self, document): for passage in document.passages: for sentence in passage.sentences: + # check for empty infons, don't process if empty + # this sometimes happens with poorly tokenized sentences + if not sentence.infons: + logging.warning( + "No parse information for sentence %d in %s", sentence.offset, document.id) + continue + elif not sentence.infons['parse tree']: + logging.warning( + "No parse tree for sentence %d in %s", sentence.offset, document.id) + continue + try: dependency_graph = self.convert( sentence.infons['parse tree']) From 0f7f2935cbec370db905ddafcbd27cd8674d6e39 Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Tue, 19 Feb 2019 18:53:41 -0500 Subject: [PATCH 6/6] remove try/except clauses from subfunctions --- negbio/pipeline/parse.py | 29 ++++++++++++----------------- negbio/pipeline/ptb2ud.py | 4 ---- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/negbio/pipeline/parse.py b/negbio/pipeline/parse.py index fa53c11..c84bda0 100644 --- a/negbio/pipeline/parse.py +++ b/negbio/pipeline/parse.py @@ -31,14 +31,11 @@ def parse(self, s): if not s: raise ValueError('Cannot parse empty sentence: {}'.format(s)) - try: - nbest = self.rrp.parse(str(s)) - if nbest: - return nbest[0].ptb_parse - else: - return None - except: - raise ValueError('Cannot parse sentence: %s' % s) + nbest = self.rrp.parse(str(s)) + if nbest: + return nbest[0].ptb_parse + + return None class NegBioParser(Bllip): @@ -54,14 +51,12 @@ def parse_doc(self, document): """ for passage in document.passages: for sentence in passage.sentences: - try: - text = sentence.text - tree = self.parse(text) - if tree: - sentence.infons['parse tree'] = str(tree) - else: - sentence.infons['parse tree'] = None - except: + text = sentence.text + tree = self.parse(text) + if tree: + sentence.infons['parse tree'] = str(tree) + else: + sentence.infons['parse tree'] = None logging.exception( - 'Cannot parse sentence: {}'.format(sentence.offset)) + 'No parse tree for sentence: %s', sentence.offset) return document diff --git a/negbio/pipeline/ptb2ud.py b/negbio/pipeline/ptb2ud.py index ab34f84..8f35265 100644 --- a/negbio/pipeline/ptb2ud.py +++ b/negbio/pipeline/ptb2ud.py @@ -107,12 +107,8 @@ def convert_doc(self, document): # check for empty infons, don't process if empty # this sometimes happens with poorly tokenized sentences if not sentence.infons: - logging.warning( - "No parse information for sentence %d in %s", sentence.offset, document.id) continue elif not sentence.infons['parse tree']: - logging.warning( - "No parse tree for sentence %d in %s", sentence.offset, document.id) continue try: