From 14133016a356adf83c63fc614c77e1637a2b6ff6 Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 17:49:32 +0000 Subject: [PATCH 01/34] Update core_functions.py Update create_from_json method. --- collatex-pythonport/collatex/core_functions.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index c2d2d4709..23d7bc026 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -108,10 +108,17 @@ def create_from_dict(cls, data, limit=None): collation.add_witness(witness) return collation + # json input can be a string or a file @classmethod - # json_data can be a string or a file - def create_from_json(cls, json_data): - data = json.load(json_data) + def create_from_json_string(cls, json_string): + data = json.loads(json_string) + collation = cls.create_from_dict(data) + return collation + + @classmethod + def create_from_json_file(cls, json_path): + with open(json_path, 'r') as json_file: + data = json.load(json_file) collation = cls.create_from_dict(data) return collation From 6842ca85cfd23e07a995158d63f62e9c7f4cf672 Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:03:52 +0000 Subject: [PATCH 02/34] Create json-test1 --- collatex-pythonport/use_cases/json-test1 | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 collatex-pythonport/use_cases/json-test1 diff --git a/collatex-pythonport/use_cases/json-test1 b/collatex-pythonport/use_cases/json-test1 new file mode 100644 index 000000000..a419c0e49 --- /dev/null +++ b/collatex-pythonport/use_cases/json-test1 @@ -0,0 +1,29 @@ +{"witnesses" : + [ + {"id" : "A","tokens" : + [ + {"t" : "The"}, + {"t" : "quick"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "dog."} + ] + }, + + {"id" : "B", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog."} + ] + } + ] +} From 5d12d0873230eddde4e5226a8e4f447010ea373d Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:04:47 +0000 Subject: [PATCH 03/34] Create json-test2.json --- collatex-pythonport/use_cases/json-test2.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 collatex-pythonport/use_cases/json-test2.json diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json new file mode 100644 index 000000000..97d91d424 --- /dev/null +++ b/collatex-pythonport/use_cases/json-test2.json @@ -0,0 +1,31 @@ +{"witnesses" : + [ + {"id" : "E","tokens" : + [ + {"t" : "The"}, + {"t" : "quick"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "dog"}, + {"t" : "."} + ] + }, + + {"id" : "F", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog"}, + {"t" : "."} + ] + } + ] +} From aad18be8ece6b9567538e1cf3ccdaaa9ad809a9c Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:05:15 +0000 Subject: [PATCH 04/34] Delete json-test1 --- collatex-pythonport/use_cases/json-test1 | 29 ------------------------ 1 file changed, 29 deletions(-) delete mode 100644 collatex-pythonport/use_cases/json-test1 diff --git a/collatex-pythonport/use_cases/json-test1 b/collatex-pythonport/use_cases/json-test1 deleted file mode 100644 index a419c0e49..000000000 --- a/collatex-pythonport/use_cases/json-test1 +++ /dev/null @@ -1,29 +0,0 @@ -{"witnesses" : - [ - {"id" : "A","tokens" : - [ - {"t" : "The"}, - {"t" : "quick"}, - {"t" : "brown"}, - {"t" : "fox"}, - {"t" : "jumps"}, - {"t" : "over"}, - {"t" : "the"}, - {"t" : "dog."} - ] - }, - - {"id" : "B", "tokens" : - [ - {"t" : "The"}, - {"t" : "brown"}, - {"t" : "fox"}, - {"t" : "jumps"}, - {"t" : "over"}, - {"t" : "the"}, - {"t" : "lazy"}, - {"t" : "dog."} - ] - } - ] -} From bebc85692fbfe05d266fd1c7a2d470e2e5c0c216 Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:05:45 +0000 Subject: [PATCH 05/34] Create json-test1.json --- collatex-pythonport/use_cases/json-test1.json | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 collatex-pythonport/use_cases/json-test1.json diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json new file mode 100644 index 000000000..6f4f00d38 --- /dev/null +++ b/collatex-pythonport/use_cases/json-test1.json @@ -0,0 +1,29 @@ +{"witnesses" : + [ + {"id" : "E","tokens" : + [ + {"t" : "The"}, + {"t" : "quick"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "dog."} + ] + }, + + {"id" : "F", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog."} + ] + } + ] +} From f68f4cccde91d72a029fcdee23ab08d7d8f7f24f Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:08:16 +0000 Subject: [PATCH 06/34] Update json-test1.json --- collatex-pythonport/use_cases/json-test1.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json index 6f4f00d38..a419c0e49 100644 --- a/collatex-pythonport/use_cases/json-test1.json +++ b/collatex-pythonport/use_cases/json-test1.json @@ -1,6 +1,6 @@ {"witnesses" : [ - {"id" : "E","tokens" : + {"id" : "A","tokens" : [ {"t" : "The"}, {"t" : "quick"}, @@ -13,7 +13,7 @@ ] }, - {"id" : "F", "tokens" : + {"id" : "B", "tokens" : [ {"t" : "The"}, {"t" : "brown"}, From b0fa0b469417e0cc46893b23cd3ffb19f0167047 Mon Sep 17 00:00:00 2001 From: enury Date: Mon, 9 Feb 2015 18:08:34 +0000 Subject: [PATCH 07/34] Update json-test2.json --- collatex-pythonport/use_cases/json-test2.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json index 97d91d424..ca26f69af 100644 --- a/collatex-pythonport/use_cases/json-test2.json +++ b/collatex-pythonport/use_cases/json-test2.json @@ -1,6 +1,6 @@ {"witnesses" : [ - {"id" : "E","tokens" : + {"id" : "C","tokens" : [ {"t" : "The"}, {"t" : "quick"}, @@ -14,7 +14,7 @@ ] }, - {"id" : "F", "tokens" : + {"id" : "D", "tokens" : [ {"t" : "The"}, {"t" : "brown"}, From ed8c8e00a296ab0720ec183a5a0bc7df37c245ec Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 10 Feb 2015 00:07:37 +0000 Subject: [PATCH 08/34] Update core_functions.py Update tokens property of Collation. --- collatex-pythonport/collatex/core_functions.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 23d7bc026..43a7b96c1 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -179,8 +179,19 @@ def to_extended_suffix_array(self): def tokens(self): #print("COLLATION TOKENIZE IS CALLED!") #TODO: complete set of witnesses is retokenized here! - tokenizer = WordPunctuationTokenizer() - tokens = tokenizer.tokenize(self.get_combined_string()) + #tokenizer = WordPunctuationTokenizer() + #tokens = tokenizer.tokenize(self.get_combined_string()) + + #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens] + tokens = [] + for i, witness in enumerate(self.witnesses): + for tk in witness._tokens: + tokens.append(tk.token_string) + # if last witness, do not append $ or i to the list of tokens + if i == len(self.witnesses)-1: + break + tokens.append('$') + tokens.append(str(i+1)) return tokens From babd89c04edd86e2a60cfcec129e63384216700e Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 10 Feb 2015 08:00:08 +0000 Subject: [PATCH 09/34] Update core_functions.py --- collatex-pythonport/collatex/core_functions.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 43a7b96c1..d73ad96d5 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -185,13 +185,11 @@ def tokens(self): #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens] tokens = [] for i, witness in enumerate(self.witnesses): + if i > 0 : + tokens.append('$') + tokens.append(str(i)) for tk in witness._tokens: tokens.append(tk.token_string) - # if last witness, do not append $ or i to the list of tokens - if i == len(self.witnesses)-1: - break - tokens.append('$') - tokens.append(str(i+1)) return tokens From 5b5ab0483674f025fe047f7680dd364ca9a4e444 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 10 Feb 2015 17:47:36 +0000 Subject: [PATCH 10/34] Update core_functions.py --- collatex-pythonport/collatex/core_functions.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index d73ad96d5..b5bcc5333 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -126,11 +126,8 @@ def __init__(self): self.witnesses = [] self.counter = 0 self.witness_ranges = {} - self.combined_string = "" self.cached_suffix_array = None - # the tokenization process happens multiple times - # and by different tokenizers. This should be fixed def add_witness(self, witnessdata): # clear the suffix array and LCP array cache self.cached_suffix_array = None @@ -141,9 +138,6 @@ def add_witness(self, witnessdata): # the extra one is for the marker token self.counter += len(witness.tokens()) +2 # $ + number self.witness_ranges[witness.sigil] = witness_range - if not self.combined_string == "": - self.combined_string += " $"+str(len(self.witnesses)-1)+ " " - self.combined_string += witness.content def add_plain_witness(self, sigil, content): return self.add_witness({'id':sigil, 'content':content}) @@ -153,9 +147,6 @@ def get_range_for_witness(self, witness_sigil): raise Exception("Witness "+witness_sigil+" is not added to the collation!") return self.witness_ranges[witness_sigil] - def get_combined_string(self): - return self.combined_string - def get_sa(self): #NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time if not self.cached_suffix_array: @@ -171,24 +162,17 @@ def get_lcp_array(self): sa = self.get_sa() return sa._LCP_values - def to_extended_suffix_array(self): return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array()) @property def tokens(self): - #print("COLLATION TOKENIZE IS CALLED!") - #TODO: complete set of witnesses is retokenized here! - #tokenizer = WordPunctuationTokenizer() - #tokens = tokenizer.tokenize(self.get_combined_string()) - - #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens] tokens = [] for i, witness in enumerate(self.witnesses): if i > 0 : tokens.append('$') tokens.append(str(i)) - for tk in witness._tokens: + for tk in witness.tokens(): tokens.append(tk.token_string) return tokens From 8bc6bec44ec5d236089e517bc57b82113a7648cb Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 10 Feb 2015 18:06:08 +0000 Subject: [PATCH 11/34] Update Witness class in core_classes.py --- collatex-pythonport/collatex/core_classes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py index 6d62f2a06..aee498dcf 100644 --- a/collatex-pythonport/collatex/core_classes.py +++ b/collatex-pythonport/collatex/core_classes.py @@ -164,17 +164,15 @@ def __init__(self, witnessdata): self.sigil = witnessdata['id'] self._tokens = [] if 'content' in witnessdata: - self.content = witnessdata['content'] - # print("Witness "+sigil+" TOKENIZER IS CALLED!") tokenizer = WordPunctuationTokenizer() - tokens_as_strings = tokenizer.tokenize(self.content) + tokens_as_strings = tokenizer.tokenize(witnessdata['content']) for token_string in tokens_as_strings: self._tokens.append(Token({'t':token_string})) elif 'tokens' in witnessdata: for tk in witnessdata['tokens']: self._tokens.append(Token(tk)) - # TODO no idea what this content string is needed for. - self.content = ' '.join([x.token_string for x in self._tokens]) + #else raise an exception, if neither 'content' or 'tokens' in witnessdata? + #also if no 'id' in witnessdata? def tokens(self): return self._tokens From eaa7a883db676461091996e0ce127f68b09d9e5d Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 00:07:42 +0000 Subject: [PATCH 12/34] Update core_functions.py Added pretokenized keyword argument in function collate, and created new function get_tokenized_at. Deleted function collate_pretokenized_json. Added option of vertical layout also for json output. --- .../collatex/core_functions.py | 67 +++++++------------ 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index b5bcc5333..071e175fc 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -16,7 +16,7 @@ # "table" for the alignment table (default) # "graph" for the variant graph # "json" for the alignment table exported as JSON -def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False): +def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False, pretokenized=False): algorithm = EditGraphAligner(collation, near_match=near_match, astar=astar, debug_scores=debug_scores) # build graph graph = VariantGraph() @@ -27,10 +27,17 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n # check which output format is requested: graph or table if output=="graph": return graph + # create alignment table table = AlignmentTable(collation, graph, layout) + if pretokenized and not segmentation: + token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses] + #for the moment only with segmentation=False + #there could be a different comportment of get_tokenized_table if semgentation=True + table = get_tokenized_at(table, token_list, segmentation=segmentation) + if output == "json": - return export_alignment_table_as_json(table) + return export_alignment_table_as_json(table, layout=layout) if output == "html": return display_alignment_table_as_HTML(table) if output == "table": @@ -38,48 +45,21 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n else: raise Exception("Unknown output type: "+output) - - -#TODO: this only works with a table output at the moment -#TODO: store the tokens on the graph instead -def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs): - # Takes more or less the same arguments as collate() above, but with some restrictions. - # Only output types 'json' and 'table' are supported. - if output not in ['json', 'table']: - raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation") - if 'segmentation' in kwargs and kwargs['segmentation']: - raise UnsupportedError("Segmented output not supported for pretokenized collation") - kwargs['segmentation'] = False - - # For each witness given, make a 'shadow' witness based on the normalization tokens - # that will actually be collated. - tokenized_witnesses = [] - collation = Collation() - for witness in json["witnesses"]: - collation.add_witness(witness) - tokenized_witnesses.append(witness["tokens"]) - at = collate(collation, output="table", **kwargs) - tokenized_at = AlignmentTable(collation, layout=layout) - for row, tokenized_witness in zip(at.rows, tokenized_witnesses): - new_row = Row(row.header) +def get_tokenized_at(table, token_list, segmentation=False): + tokenized_at = AlignmentTable(Collation()) + for witness_row, witness_tokens in zip(table.rows, token_list): + new_row = Row(witness_row.header) tokenized_at.rows.append(new_row) - token_counter = 0 - for cell in row.cells: + counter = 0 + for cell in witness_row.cells: if cell != "-": - new_row.cells.append(tokenized_witness[token_counter]) - token_counter+=1 - else: - #TODO: should probably be null or None instead, but that would break the rendering at the moment - new_row.cells.append({"t":"-"}) - if output=="json": - return export_alignment_table_as_json(tokenized_at) - if output=="table": - # transform JSON objects to "t" form. - for row in tokenized_at.rows: - row.cells = [cell["t"] for cell in row.cells] - return tokenized_at - -def export_alignment_table_as_json(table, indent=None, status=False): + new_row.cells.append(witness_tokens[counter]) + counter+=1 + else: + new_row.cells.append({}) + return tokenized_at + +def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"): json_output = {} json_output["table"]=[] sigli = [] @@ -92,6 +72,9 @@ def export_alignment_table_as_json(table, indent=None, status=False): for column in table.columns: variant_status.append(column.variant) json_output["status"]=variant_status + if layout=="vertical": + new_table = [[row[i] for row in json_output["table"]] for i in range(len(row.cells))] + json_output["table"] = new_table return json.dumps(json_output, sort_keys=True, indent=indent) ''' From 6672e5288fdbf94dd646eb0ab465829b0e2fb194 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 00:08:45 +0000 Subject: [PATCH 13/34] Update __init__.py --- collatex-pythonport/collatex/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/collatex-pythonport/collatex/__init__.py b/collatex-pythonport/collatex/__init__.py index 72cd2d045..a277707f2 100755 --- a/collatex-pythonport/collatex/__init__.py +++ b/collatex-pythonport/collatex/__init__.py @@ -7,8 +7,7 @@ from collatex.core_functions import Collation from collatex.core_functions import collate -from collatex.core_functions import collate_pretokenized_json -__all__ = ["Collation", "collate", "collate_pretokenized_json"] +__all__ = ["Collation", "collate"] From 070f8256e21276046c58a1b018757932169ccf4e Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 10:55:49 +0000 Subject: [PATCH 14/34] Update core_functions.py --- .../collatex/core_functions.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 071e175fc..354ce3375 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -35,6 +35,10 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n #for the moment only with segmentation=False #there could be a different comportment of get_tokenized_table if semgentation=True table = get_tokenized_at(table, token_list, segmentation=segmentation) + # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict) + if output=="table" or output=="html": + for row in table.rows: + row.cells = [cell["t"] for cell in row.cells] if output == "json": return export_alignment_table_as_json(table, layout=layout) @@ -52,11 +56,22 @@ def get_tokenized_at(table, token_list, segmentation=False): tokenized_at.rows.append(new_row) counter = 0 for cell in witness_row.cells: - if cell != "-": + if cell == "-": + # TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41) + new_row.cells.append({"t" : "-"}) + # if segmentation=False + else: new_row.cells.append(witness_tokens[counter]) counter+=1 - else: - new_row.cells.append({}) + # else if segmentation=True + #string = witness_tokens[counter].token_string + #token_counter = 1 + #while string != cell: + ##add token_string of the next token until it is equivalent to the string in the cell + #string += next token string + #token_counter += 1 + #new_row.cells.append([tk for tk in witness_tokens[counter:counter+token_counter]]) + #update counter (counter += token_counter) return tokenized_at def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"): From 42fb5ebf7c240dd081a9f3ef998652a15f3969a3 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 15:27:57 +0000 Subject: [PATCH 15/34] Update core_functions.py Replaced Collation.tokens property with an attribute combined_tokens. --- .../collatex/core_functions.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 354ce3375..212ba9b31 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -32,8 +32,8 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n table = AlignmentTable(collation, graph, layout) if pretokenized and not segmentation: token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses] - #for the moment only with segmentation=False - #there could be a different comportment of get_tokenized_table if semgentation=True + # only with segmentation=False + # there could be a different comportment of get_tokenized_table if semgentation=True table = get_tokenized_at(table, token_list, segmentation=segmentation) # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict) if output=="table" or output=="html": @@ -125,6 +125,7 @@ def __init__(self): self.counter = 0 self.witness_ranges = {} self.cached_suffix_array = None + self.combined_tokens =[] def add_witness(self, witnessdata): # clear the suffix array and LCP array cache @@ -136,6 +137,11 @@ def add_witness(self, witnessdata): # the extra one is for the marker token self.counter += len(witness.tokens()) +2 # $ + number self.witness_ranges[witness.sigil] = witness_range + if len(self.witnesses) > 1: + self.combined_tokens.append('$') + self.combined_tokens.append(str(len(self.witnesses)-1)) + for tk in witness.tokens(): + self.combined_tokens.append(tk.token_string) def add_plain_witness(self, sigil, content): return self.add_witness({'id':sigil, 'content':content}) @@ -149,7 +155,7 @@ def get_sa(self): #NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time if not self.cached_suffix_array: # Unit byte is done to skip tokenization in third party library - self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE) + self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE) return self.cached_suffix_array def get_suffix_array(self): @@ -161,18 +167,6 @@ def get_lcp_array(self): return sa._LCP_values def to_extended_suffix_array(self): - return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array()) - - @property - def tokens(self): - tokens = [] - for i, witness in enumerate(self.witnesses): - if i > 0 : - tokens.append('$') - tokens.append(str(i)) - for tk in witness.tokens(): - tokens.append(tk.token_string) - return tokens - + return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array()) From 3392a422967130b9d2e06f89f8e9075fa702fbe8 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 15:29:34 +0000 Subject: [PATCH 16/34] Update suffix_based_scorer.py Replaced Collation.tokens property with combined_tokens attribute. --- collatex-pythonport/collatex/suffix_based_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collatex-pythonport/collatex/suffix_based_scorer.py b/collatex-pythonport/collatex/suffix_based_scorer.py index cfb2713f2..27c8b468e 100644 --- a/collatex-pythonport/collatex/suffix_based_scorer.py +++ b/collatex-pythonport/collatex/suffix_based_scorer.py @@ -147,7 +147,7 @@ def _get_block_witness(self, witness): occurrences.append(occurrence) # sort occurrences on position sorted_o = sorted(occurrences, key=attrgetter('lower_end')) - block_witness = BlockWitness(sorted_o, self.collation.tokens) + block_witness = BlockWitness(sorted_o, self.collation.combined_tokens) return block_witness ''' From 58a3a5e93143f38e08084dd9ec0c1b2fbefb0fcf Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 15:59:34 +0000 Subject: [PATCH 17/34] Update core_functions.py Added auto-detection of pretokenized json. --- collatex-pythonport/collatex/core_functions.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 212ba9b31..5b66988ce 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -16,7 +16,7 @@ # "table" for the alignment table (default) # "graph" for the variant graph # "json" for the alignment table exported as JSON -def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False, pretokenized=False): +def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False): algorithm = EditGraphAligner(collation, near_match=near_match, astar=astar, debug_scores=debug_scores) # build graph graph = VariantGraph() @@ -30,7 +30,7 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n # create alignment table table = AlignmentTable(collation, graph, layout) - if pretokenized and not segmentation: + if collation.pretokenized and not segmentation: token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses] # only with segmentation=False # there could be a different comportment of get_tokenized_table if semgentation=True @@ -99,8 +99,13 @@ class Collation(object): @classmethod def create_from_dict(cls, data, limit=None): + if "witnesses" not in data: + raise UnsupportedError("Json input not valid") witnesses = data["witnesses"] collation = Collation() + # determine if data is pretokenized (check for the first witness) + if 'tokens' in witnesses[0]: + collation.pretokenized = True for witness in witnesses[:limit]: # generate collation object from json_data collation.add_witness(witness) @@ -122,6 +127,7 @@ def create_from_json_file(cls, json_path): def __init__(self): self.witnesses = [] + self.pretokenized = False self.counter = 0 self.witness_ranges = {} self.cached_suffix_array = None From b5e2bad1340b36db7f0b407c6e46ffae8ec68075 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 16:14:58 +0000 Subject: [PATCH 18/34] Update core_classes.py Added exception handling in Witness class. --- collatex-pythonport/collatex/core_classes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py index aee498dcf..882af47e4 100644 --- a/collatex-pythonport/collatex/core_classes.py +++ b/collatex-pythonport/collatex/core_classes.py @@ -161,6 +161,8 @@ def __repr__(self): class Witness(object): def __init__(self, witnessdata): + if 'id' not in witnessdata: + raise UnsupportedError("No defined id in witnessdata") self.sigil = witnessdata['id'] self._tokens = [] if 'content' in witnessdata: @@ -171,8 +173,8 @@ def __init__(self, witnessdata): elif 'tokens' in witnessdata: for tk in witnessdata['tokens']: self._tokens.append(Token(tk)) - #else raise an exception, if neither 'content' or 'tokens' in witnessdata? - #also if no 'id' in witnessdata? + else: + raise UnsupportedError("No defined content/tokens in witness "+self.sigil) def tokens(self): return self._tokens From 7fc92ebf0d8df16cb4fe35286759056de0421aeb Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 16:32:37 +0000 Subject: [PATCH 19/34] Create json-test3.json --- collatex-pythonport/use_cases/json-test3.json | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 collatex-pythonport/use_cases/json-test3.json diff --git a/collatex-pythonport/use_cases/json-test3.json b/collatex-pythonport/use_cases/json-test3.json new file mode 100644 index 000000000..dcbee333e --- /dev/null +++ b/collatex-pythonport/use_cases/json-test3.json @@ -0,0 +1,29 @@ +{"witnesses" : + [ + {"id" : "E","tokens" : + [ + {"t" : "The", "id": 1, "n": "the"}, + {"t" : "quick", "id": 2}, + {"t" : "brown", "id": 3}, + {"t" : "fox", "id": 4}, + {"t" : "jumps", "id": 5}, + {"t" : "over", "id": 6}, + {"t" : "the", "id": 7}, + {"t" : "dog.", "id": 8, "n": "dog"} + ] + }, + + {"id" : "F", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog."} + ] + } + ] +} From b7beb23639f8ec49693437dda1074e79da159836 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 11 Feb 2015 16:47:12 +0000 Subject: [PATCH 20/34] Update core_functions.py --- collatex-pythonport/collatex/core_functions.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 5b66988ce..69b010c70 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -64,14 +64,7 @@ def get_tokenized_at(table, token_list, segmentation=False): new_row.cells.append(witness_tokens[counter]) counter+=1 # else if segmentation=True - #string = witness_tokens[counter].token_string - #token_counter = 1 - #while string != cell: - ##add token_string of the next token until it is equivalent to the string in the cell - #string += next token string - #token_counter += 1 - #new_row.cells.append([tk for tk in witness_tokens[counter:counter+token_counter]]) - #update counter (counter += token_counter) + # do something else... return tokenized_at def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"): From a5bc15b3173f40f167c644c410f1971495141e20 Mon Sep 17 00:00:00 2001 From: enury Date: Thu, 12 Feb 2015 10:42:40 +0000 Subject: [PATCH 21/34] Update core_functions.py --- collatex-pythonport/collatex/core_functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 69b010c70..7875cf271 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -96,12 +96,12 @@ def create_from_dict(cls, data, limit=None): raise UnsupportedError("Json input not valid") witnesses = data["witnesses"] collation = Collation() - # determine if data is pretokenized (check for the first witness) - if 'tokens' in witnesses[0]: - collation.pretokenized = True for witness in witnesses[:limit]: # generate collation object from json_data collation.add_witness(witness) + # determine if data is pretokenized + if 'tokens' in witness: + collation.pretokenized = True return collation # json input can be a string or a file From 42de8b1e3d9b65cc43fda22ce866feabe6ca5873 Mon Sep 17 00:00:00 2001 From: enury Date: Thu, 12 Feb 2015 17:10:39 +0000 Subject: [PATCH 22/34] Update core_functions.py Added layout for tokenized alignment table Attempt at creating the tokenized alignment table when tokens are joined into segments. --- .../collatex/core_functions.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index 7875cf271..318e0437f 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -34,7 +34,7 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses] # only with segmentation=False # there could be a different comportment of get_tokenized_table if semgentation=True - table = get_tokenized_at(table, token_list, segmentation=segmentation) + table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout) # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict) if output=="table" or output=="html": for row in table.rows: @@ -49,8 +49,8 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n else: raise Exception("Unknown output type: "+output) -def get_tokenized_at(table, token_list, segmentation=False): - tokenized_at = AlignmentTable(Collation()) +def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"): + tokenized_at = AlignmentTable(Collation(), layout=layout) for witness_row, witness_tokens in zip(table.rows, token_list): new_row = Row(witness_row.header) tokenized_at.rows.append(new_row) @@ -64,7 +64,19 @@ def get_tokenized_at(table, token_list, segmentation=False): new_row.cells.append(witness_tokens[counter]) counter+=1 # else if segmentation=True - # do something else... + ##token_list must be a list of Token instead of list of dict (update lines 34, 64) + ##line 41 will not be happy in case of table/html output + #string = witness_tokens[counter].token_string + #token_counter = 1 + #while string != cell : + # if counter+token_counter-1 < len(witness_tokens)-1: + # #add token_string of the next token until it is equivalent to the string in the cell + # #if we are not at the last token + # string += ' '+witness_tokens[counter+token_counter].token_string + # token_counter += 1 + ##there is one list level too many in the output + #new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]]) + #counter += token_counter. return tokenized_at def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"): From 9aeaebbd744632f8fad63469e6965c945a3164da Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 11:43:23 +0000 Subject: [PATCH 23/34] Create test_token_class.py --- collatex-pythonport/tests/test_token_class.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 collatex-pythonport/tests/test_token_class.py diff --git a/collatex-pythonport/tests/test_token_class.py b/collatex-pythonport/tests/test_token_class.py new file mode 100644 index 000000000..fe4088ad0 --- /dev/null +++ b/collatex-pythonport/tests/test_token_class.py @@ -0,0 +1,39 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_classes import Token +from collatex.exceptions import TokenError + + +class TestToken(unittest.TestCase): + + def test_creation_token_t(self): + data = {'t': 'fox', 'id': 123 } + t = Token(data) + self.assertEqual(t.token_string, 'fox') + self.assertEqual(t.token_data, data) + + def test_creation_token_n(self): + data = {'t': 'kitten', 'n': 'cat'} + t = Token(data) + self.assertEqual(t.token_string, 'cat') + self.assertEqual(t.token_data, data) + + def test_creation_token_none(self): + t = Token(None) + self.assertEqual(t.token_string, '') + self.assertIsNone(t.token_data) + + def test_invalid_token_raises_exception(self): + with self.assertRaises(TokenError): + #data = {'x': 'abc'} + data = {} + Token(data) + +if __name__ == '__main__': + unittest.main() + From 13af6ddcc3a9a5430e315cfee1a048e3e43dfe9e Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 11:44:46 +0000 Subject: [PATCH 24/34] Create test_witness_class.py --- .../tests/test_witness_class.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 collatex-pythonport/tests/test_witness_class.py diff --git a/collatex-pythonport/tests/test_witness_class.py b/collatex-pythonport/tests/test_witness_class.py new file mode 100644 index 000000000..8f2e6e33b --- /dev/null +++ b/collatex-pythonport/tests/test_witness_class.py @@ -0,0 +1,54 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_classes import Witness, Token, Tokenizer +from collatex.exceptions import UnsupportedError, TokenError + +class TestWitness(unittest.TestCase): + + def test_creation_witness_plain(self): + data = {'id': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'} + w = Witness(data) + self.assertEqual(w.sigil, 'A') + self.assertEqual(len(w.tokens()), 10) + self.assertEqual(w.tokens()[3].token_string, 'fox') + + def test_creation_witness_pretokenized(self): + data = { 'id': 'B', + 'tokens': [ + {'t': 'A', 'ref': 123}, + {'t': 'black and blue', 'adj': True}, + {'t': 'cat', 'id': 'xyz'}, + {'t': 'bird.', 'id': 'abc'} + ] + } + w = Witness(data) + self.assertEqual(w.sigil, 'B') + self.assertEqual(len(w.tokens()), 4) + + def test_invalid_witness_missing_id(self): + data = {'name': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'} + self.assertRaises(UnsupportedError, Witness, data) + + def test_invalid_witness_missing_content_tokens(self): + data = {'id': 'A'} + self.assertRaises(UnsupportedError, Witness, data) + + def test_invalid_witness_content_is_pretokenized(self): + #'content' is pretokenized instead of plain text + data = {'id': 'A', 'content': [{'t':'the'}, {'t':'fox'}]} + self.assertRaises(TypeError, Witness, data) + + def test_invalid_witness_tokens_is_plain(self): + #'tokens' is plain text instead of pretokenized + data = {'id': 'A', 'tokens': 'The quick brown fox jumped over the lazy dogs.'} + self.assertRaises(TokenError, Witness, data) + + +if __name__ == '__main__': + unittest.main() + From 87675aa504705267d658be023d39969e0672ae67 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 11:46:44 +0000 Subject: [PATCH 25/34] Create test_collation_class.py --- .../tests/test_collation_class.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 collatex-pythonport/tests/test_collation_class.py diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py new file mode 100644 index 000000000..a97fb684e --- /dev/null +++ b/collatex-pythonport/tests/test_collation_class.py @@ -0,0 +1,84 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_functions import * +from collatex.exceptions import UnsupportedError +from testfixtures import TempDirectory +import os +import json + +class TestCollationMethods(unittest.TestCase): + + def test_collation_method_create_from_json_file(self): + with TempDirectory() as d: + #create a temporary file in a temporary directory + d.write('testfile.json', b'{"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}') + c = Collation.create_from_json_file(os.path.join(d.path, 'testfile.json')) + self.assertEqual(len(c.witnesses), 2) + + def test_collation_create_from_dict(self): + data = {"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]} + c = Collation.create_from_dict(data) + self.assertEqual(len(c.witnesses), 2) + + +class TestCollationFunctions(unittest.TestCase): + def setUp(self): + data = { + 'witnesses' : [ + { + 'id' : 'A', + 'content' : 'The cat' + }, + { + 'id' : 'B', + 'tokens' : [ + { 't' : 'The'}, + { 't' : 'kitten'} + ] + } + ] + } + self.c = Collation.create_from_dict(data) + + def test_collation_function_add_plain_witness(self): + self.c.add_plain_witness('C', 'A cat') + self.assertEqual(len(self.c.witnesses), 3) + + def test_collation_function_add_witness(self): + witnessdata = {'id': 'C', 'tokens': [{ 't' : 'A'},{ 't' : 'cat'}]} + self.c.add_witness(witnessdata) + self.assertEqual(len(self.c.witnesses), 3) + + def test_collation_function_add_witnesses_with_same_id(self): + witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]} + witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]} + self.c.add_witness(witnessdata1) + self.c.add_witness(witnessdata2) + self.assertEqual(len(self.c.witnesses), 4) + + #error in the collation result => there should be an exception raised... + #json_result = json.loads(collate(self.c, output='json')) + #self.assertEqual(json_result['table'][2][1], 'fox') + #self.assertEqual(json_result['table'][3][1], 'dog') + self.fail("It should not be possible to add 2 witnesses with the same id") + + def test_collation_function_get_range_for_witness(self): + expected_range_B = RangeSet() + expected_range_B.add_range(4, 6) + self.assertEqual(self.c.get_range_for_witness('B'), expected_range_B) + self.assertRaises(Exception, self.c.get_range_for_witness, 'W') + + #test other functions? + #get suffix array + #get sa + #get lcp array + #to extended suffix array + + +if __name__ == '__main__': + unittest.main() From 8b9b0ece18c7c46c672dc59fb73614aeaa6012b0 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 11:48:15 +0000 Subject: [PATCH 26/34] Create test_collate_outputs.py --- .../tests/test_collate_outputs.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 collatex-pythonport/tests/test_collate_outputs.py diff --git a/collatex-pythonport/tests/test_collate_outputs.py b/collatex-pythonport/tests/test_collate_outputs.py new file mode 100644 index 000000000..cb9c8e5dd --- /dev/null +++ b/collatex-pythonport/tests/test_collate_outputs.py @@ -0,0 +1,275 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_functions import * +from collatex.exceptions import UnsupportedError + +class TestCollate(unittest.TestCase): + def test_collate_with_invalid_output(self): + data = {"witnesses" : + [ + {"id" : "A", "tokens" : + [ + {"t": "A", "id": 1}, + {"t": "small"}, + {"t": "black"}, + {"t": "cat"} + ] + }, + {"id" : "B", "tokens" : + [ + {"t": "A"}, + {"t": "small"}, + {"t": "white"}, + {"t": "kitten.", "n": "cat"} + ] + } + ] + } + c = Collation.create_from_dict(data) + with self.assertRaises(Exception): + collate(c, output="xyz") + + def test_collate_with_empty_collation(self): + c = Collation() + with self.assertRaises(IndexError): + collate(c) + + +class TestTokenizedJsonOutput(unittest.TestCase): + def setUp(self): + self.data = {"witnesses" : + [ + {"id" : "A", "tokens" : + [ + {"t": "A", "id": 1}, + {"t": "small"}, + {"t": "black"}, + {"t": "cat"} + ] + }, + {"id" : "B", "tokens" : + [ + {"t": "A"}, + {"t": "small"}, + {"t": "white"}, + {"t": "kitten.", "n": "cat"} + ] + } + ] + } + self.c = Collation.create_from_dict(self.data) + self.maxDiff = None + + #-------------------------------------------------- + #JSON output + def test_tokenized_output_json_segmentationFalse_layoutHorizontal(self): + expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "small"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "small"}], [{"t": "white"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationFalse_layoutVertical(self): + expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "A"}]], [[{"t": "small"}], [{"t": "small"}]], [[{"t": "black"}], [{"t": "white"}]], [[{"t": "cat"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False, layout="vertical") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationTrue_layoutHorizontal(self): + expected = '{"table": [[["A small"], ["black"], ["cat"]], [["A small"], ["white"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=True, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationTrue_layoutVertical(self): + expected = '{"table": [[["A small"], ["A small"]], [["black"], ["white"]], [["cat"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=True, layout="vertical") + self.assertEqual(output, expected) + + #-------------------------------------------------- + #TABLE output + + def test_tokenized_output_table_segmentationFalse_layoutHorizontal(self): + expected = """\ ++---+---+-------+-------+---------+ +| A | A | small | black | cat | +| B | A | small | white | kitten. | ++---+---+-------+-------+---------+""" + output = str(collate(self.c, output="table", segmentation=False, layout="horizontal")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationFalse_layoutVertical(self): + expected = '''\ ++-------+---------+ +| A | B | ++-------+---------+ +| A | A | ++-------+---------+ +| small | small | ++-------+---------+ +| black | white | ++-------+---------+ +| cat | kitten. | ++-------+---------+''' + output = str(collate(self.c, output="table", segmentation=False, layout="vertical")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationTrue_layoutHorizontal(self): + expected = """\ ++---+---------+-------+-----+ +| A | A small | black | cat | +| B | A small | white | cat | ++---+---------+-------+-----+""" + output = str(collate(self.c, output="table", segmentation=True, layout="horizontal")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationTrue_layoutVertical(self): + expected = '''\ ++---------+---------+ +| A | B | ++---------+---------+ +| A small | A small | ++---------+---------+ +| black | white | ++---------+---------+ +| cat | cat | ++---------+---------+''' + output = str(collate(self.c, output="table", segmentation=True, layout="vertical")) + self.assertEqual(output, expected) + + #-------------------------------------------------- + #HTML output + + def test_tokenized_output_html_segmentationFalse_layoutHorizontal(self): + expected = '''\ + + + + + + + + + + + + + + + +
AAsmallblackcat
BAsmallwhitekitten.
''' + output = collate(self.c, output="html", segmentation=False, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationFalse_layoutVertical(self): + expected = '''\ + + + + + + + + + + + + + + + + + + + + + +
AB
AA
smallsmall
blackwhite
catkitten.
''' + output = collate(self.c, output="html", segmentation=False, layout="vertical") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationTrue_layoutHorizontal(self): + expected = '''\ + + + + + + + + + + + + + +
AA smallblackcat
BA smallwhitecat
''' + output = collate(self.c, output="html", segmentation=True, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationTrue_layoutVertical(self): + expected = '''\ + + + + + + + + + + + + + + + + + +
AB
A smallA small
blackwhite
catcat
''' + output = collate(self.c, output="html", segmentation=True, layout="vertical") + self.assertEqual(output, expected) + + + + +#-------------------------------------------------- +#Empty cells output + +class TestOutputEmptyCells(unittest.TestCase): + def setUp(self): + data = { + "witnesses" : [ + { + "id" : "A", + "tokens" : [ + { "t" : "A"}, + { "t" : "black"}, + { "t" : "cat"} + ] + }, + { + "id" : "B", + "tokens" : [ + { "t": "A" }, + { "t": "kitten.", "n": "cat" } + ] + } + ] + } + self.c = Collation.create_from_dict(data) + + def test_json_segmentationTrue_output_with_empty_cells(self): + expected = '{"table": [[["A"], ["black"], ["cat"]], [["A"], ["-"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json") + self.assertEqual(output, expected) + + def test_json_segmentationFalse_output_with_empty_cells(self): + expected = '{"table": [[[{"t": "A"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "-"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False) + self.assertEqual(output, expected) + + +if __name__ == '__main__': + unittest.main() From 42546cf2c5b0dab0ff0c8f468ea59e1311e44281 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 11:49:34 +0000 Subject: [PATCH 27/34] Update core_classes.py --- collatex-pythonport/collatex/core_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py index 882af47e4..9f1d21d92 100644 --- a/collatex-pythonport/collatex/core_classes.py +++ b/collatex-pythonport/collatex/core_classes.py @@ -13,7 +13,7 @@ import re from prettytable import PrettyTable from textwrap import fill -from collatex.exceptions import TokenError +from collatex.exceptions import TokenError, UnsupportedError class Row(object): From c2923d5859c5137b2b10d355f32bf757cecf9586 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 12:38:51 +0000 Subject: [PATCH 28/34] Update test_collatex_block_witnesses.py --- collatex-pythonport/tests/test_collatex_block_witnesses.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py index 7cb412822..48e78361e 100644 --- a/collatex-pythonport/tests/test_collatex_block_witnesses.py +++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py @@ -29,7 +29,7 @@ def test_combined_string_hermans_case(self): collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") # $ is meant to separate witnesses here - self.assertEquals("a b c d F g h i ! K ! q r s t $1 a b c d F g h i ! q r s t", collation.get_combined_string()) + self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens)) # test whether the witness->range mapping works @unit_disabled @@ -241,4 +241,4 @@ def test_filter_potential_blocks(self): if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() \ No newline at end of file + unittest.main() From 2914b6b2407452aff6f74624251f35939c826601 Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 12:44:25 +0000 Subject: [PATCH 29/34] Update test_near_matching_pretokenized.py --- .../tests/test_near_matching_pretokenized.py | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/collatex-pythonport/tests/test_near_matching_pretokenized.py b/collatex-pythonport/tests/test_near_matching_pretokenized.py index cad73a67e..7beb7f043 100644 --- a/collatex-pythonport/tests/test_near_matching_pretokenized.py +++ b/collatex-pythonport/tests/test_near_matching_pretokenized.py @@ -5,61 +5,63 @@ ''' import unittest from tests import unit_disabled -from collatex.core_functions import collate_pretokenized_json +from collatex.core_functions import * class Test(unittest.TestCase): - json_in = { - "witnesses" : [ - { - "id" : "A", - "tokens" : [ - { "t" : "I", "ref" : 123 }, - { "t" : "bought" , "adj" : True }, - { "t" : "this", "id" : "x3" }, - { "t" : "glass", "id" : "x4" }, - { "t" : ",", "type" : "punct" }, - { "t" : "because", "id" : "x5" }, - { "t" : "it", "id" : "x6" }, - { "t" : "matches" }, - { "t" : "those", "id" : "x7" }, - { "t" : "dinner", "id" : "x8" }, - { "t" : "plates", "id" : "x9" }, - { "t" : ".", "type" : "punct" } - ] - }, - { - "id" : "B", - "tokens" : [ - { "t" : "I" }, - { "t" : "bought" , "adj" : True }, - { "t" : "those", "id" : "abc" }, - { "t" : "glasses", "id" : "xyz" }, - { "t" : ".", "type" : "punct" } - ] + def setUp(self): + json_in = { + "witnesses" : [ + { + "id" : "A", + "tokens" : [ + { "t" : "I", "ref" : 123 }, + { "t" : "bought" , "adj" : True }, + { "t" : "this", "id" : "x3" }, + { "t" : "glass", "id" : "x4" }, + { "t" : ",", "type" : "punct" }, + { "t" : "because", "id" : "x5" }, + { "t" : "it", "id" : "x6" }, + { "t" : "matches" }, + { "t" : "those", "id" : "x7" }, + { "t" : "dinner", "id" : "x8" }, + { "t" : "plates", "id" : "x9" }, + { "t" : ".", "type" : "punct" } + ] + }, + { + "id" : "B", + "tokens" : [ + { "t" : "I" }, + { "t" : "bought" , "adj" : True }, + { "t" : "those", "id" : "abc" }, + { "t" : "glasses", "id" : "xyz" }, + { "t" : ".", "type" : "punct" } + ] + } + ] } - ] - } + self.c = Collation.create_from_dict(json_in) def test_exact_matching(self): - result = collate_pretokenized_json(self.json_in) - self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], + result = collate(self.c, segmentation=False) + self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], result.rows[0].to_list()) - self.assertEquals(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list()) + self.assertEqual(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list()) def test_near_matching(self): - result = collate_pretokenized_json(self.json_in, near_match=True) - self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], + result = collate(self.c, segmentation=False, near_match=True) + self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], result.rows[0].to_list()) - self.assertEquals(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list()) + self.assertEqual(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list()) # Re-enable this one if segmented output is ever supported on tokenized collation @unit_disabled def test_near_matching_segmented(self): - result = collate_pretokenized_json(self.json_in, near_match=True, segmentation=True) - self.assertEquals(["I bought", "this glass, because it matches those dinner plates."], + result = collate(self.c, near_match=True, segmentation=True) + self.assertEqual(["I bought", "this glass, because it matches those dinner plates."], result.rows[0].to_list()) - self.assertEquals(["I bought", "those glasses."], result.rows[1].to_list()) + self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list()) if __name__ == "__main__": From 977b0fa97c598061ddb48d3578c0f8c78f6c751d Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 12:48:10 +0000 Subject: [PATCH 30/34] Update test_witness_tokens.py --- collatex-pythonport/tests/test_witness_tokens.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/collatex-pythonport/tests/test_witness_tokens.py b/collatex-pythonport/tests/test_witness_tokens.py index 56e97f437..0a0ff7f07 100644 --- a/collatex-pythonport/tests/test_witness_tokens.py +++ b/collatex-pythonport/tests/test_witness_tokens.py @@ -6,7 +6,7 @@ import unittest from collatex import Collation -from collatex.core_functions import collate_pretokenized_json +from collatex.core_functions import collate class Test(unittest.TestCase): @@ -52,7 +52,8 @@ def testPretokenizedWitness(self): } ] } - result = collate_pretokenized_json(pretokenized_witness) + c = Collation.create_from_dict(pretokenized_witness) + result = collate(c, segmentation=False) self.assertEqual(len(result.rows[0].to_list()), 4) self.assertEqual(len(result.rows[1].to_list()), 4) # The second witness should have a token that reads 'mousedog bird'. From 65a18f8dbd0b96d1e614a419f772a3a48aa66a3f Mon Sep 17 00:00:00 2001 From: enury Date: Tue, 24 Mar 2015 17:54:39 +0000 Subject: [PATCH 31/34] Update test_collatex_block_witnesses.py --- .../tests/test_collatex_block_witnesses.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py index 48e78361e..bd9a7a130 100644 --- a/collatex-pythonport/tests/test_collatex_block_witnesses.py +++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py @@ -32,13 +32,12 @@ def test_combined_string_hermans_case(self): self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens)) # test whether the witness->range mapping works - @unit_disabled def test_witness_ranges_hermans_case(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") self.assertEquals(RangeSet("0-14"), collation.get_range_for_witness("W1")) - self.assertEquals(RangeSet("16-28"), collation.get_range_for_witness("W2")) + self.assertEquals(RangeSet("17-29"), collation.get_range_for_witness("W2")) # TODO: re-enable test! # Note: LCP intervals can overlap @@ -74,14 +73,13 @@ def test_lcp_child_intervals_hermans_case(self): _, child_lcp_intervals = collation.get_lcp_intervals() self.assertFalse(child_lcp_intervals) - @unit_disabled def test_non_overlapping_blocks_black_cat(self): collation = Collation() collation.add_plain_witness("W1", "the black cat") collation.add_plain_witness("W2", "the black cat") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - block1 = Block(RangeSet("0-2, 4-6")) + block1 = Block(RangeSet("0-2, 5-7")) self.assertEqual([block1], blocks) #TODO: Fix number of siblings! @@ -97,17 +95,15 @@ def test_blocks_failing_transposition_use_case_old_algorithm(self): block3 = Block(RangeSet("2, 8")) self.assertEqual([block1, block2, block3], blocks) - @unit_disabled def test_non_overlapping_blocks_Hermans(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - self.assertIn(Block(RangeSet("0-8, 16-24")), blocks) # a b c d F g h i ! - self.assertIn(Block(RangeSet("11-14, 25-28")), blocks) # q r s t + self.assertIn(Block(RangeSet("0-8, 17-25")), blocks) # a b c d F g h i ! + self.assertIn(Block(RangeSet("11-14, 26-29")), blocks) # q r s t - @unit_disabled def test_blocks_Hermans_case_three_witnesses(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") @@ -115,21 +111,20 @@ def test_blocks_Hermans_case_three_witnesses(self): collation.add_plain_witness("W3", "a b c d E g h i ! q r s t") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks) # a b c d - self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks) # g h i - self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), blocks) # ! q r s t - self.assertIn(Block(RangeSet("4, 20")), blocks) # F + self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), blocks) # a b c d + self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), blocks) # g h i + self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")), blocks) # ! q r s t + self.assertIn(Block(RangeSet("4, 21")), blocks) # F # In the new approach nothing should be split - @unit_disabled def test_blocks_splitting_token_case(self): collation = Collation() collation.add_plain_witness("W1", "a c b c") collation.add_plain_witness("W2", "a c b") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - block1 = Block(RangeSet("0-2, 5-7")) # a c b + block1 = Block(RangeSet("0-2, 6-8")) # a c b self.assertIn(block1, blocks) @unit_disabled From c70db1a4a5ac99d4a473cc93d3030591649e1562 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 25 Mar 2015 10:49:04 +0000 Subject: [PATCH 32/34] Update test_collation_class.py --- collatex-pythonport/tests/test_collation_class.py | 1 + 1 file changed, 1 insertion(+) diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py index a97fb684e..fd3d2ac7a 100644 --- a/collatex-pythonport/tests/test_collation_class.py +++ b/collatex-pythonport/tests/test_collation_class.py @@ -54,6 +54,7 @@ def test_collation_function_add_witness(self): self.c.add_witness(witnessdata) self.assertEqual(len(self.c.witnesses), 3) + @unittest.expectedFailure def test_collation_function_add_witnesses_with_same_id(self): witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]} witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]} From ecdb7645456a96c695e6db0878e727bef0244c43 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 25 Mar 2015 14:59:44 +0000 Subject: [PATCH 33/34] Port to Python 3: use next(generator) instead of generator.next() --- collatex-pythonport/collatex/collatex_suffix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collatex-pythonport/collatex/collatex_suffix.py b/collatex-pythonport/collatex/collatex_suffix.py index 4cd5ab4ee..713f7f3ac 100644 --- a/collatex-pythonport/collatex/collatex_suffix.py +++ b/collatex-pythonport/collatex/collatex_suffix.py @@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens): def debug(self): result = [] for occurrence in self.occurrences: - result.append(' '.join(self.tokens[occurrence.token_range.slices().next()])) + result.append(' '.join(self.tokens[next(occurrence.token_range.slices())])) return result From 193351e6d054d7dbcbec7c36bc71753040f59f00 Mon Sep 17 00:00:00 2001 From: enury Date: Wed, 25 Mar 2015 15:02:30 +0000 Subject: [PATCH 34/34] Update test_collatex_block_witnesses.py --- collatex-pythonport/tests/test_collatex_block_witnesses.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py index bd9a7a130..ee7cc3790 100644 --- a/collatex-pythonport/tests/test_collatex_block_witnesses.py +++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py @@ -127,7 +127,6 @@ def test_blocks_splitting_token_case(self): block1 = Block(RangeSet("0-2, 6-8")) # a c b self.assertIn(block1, blocks) - @unit_disabled def test_block_witnesses_Hermans_case_two_witnesses(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") @@ -138,7 +137,6 @@ def test_block_witnesses_Hermans_case_two_witnesses(self): block_witness = algorithm._get_block_witness(collation.witnesses[1]) self.assertEquals(["a b c d F g h i !", "q r s t"], block_witness.debug()) - @unit_disabled def test_block_witnesses_Hermans_case(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")