From 14133016a356adf83c63fc614c77e1637a2b6ff6 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 17:49:32 +0000
Subject: [PATCH 01/34] Update core_functions.py

Split create_from_json into create_from_json_string and create_from_json_file.
---
 collatex-pythonport/collatex/core_functions.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index c2d2d4709..23d7bc026 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -108,10 +108,17 @@ def create_from_dict(cls, data, limit=None):
             collation.add_witness(witness)
         return collation
 
+    # json input can be a string or a file
     @classmethod
-    # json_data can be a string or a file
-    def create_from_json(cls, json_data):
-        data = json.load(json_data)
+    def create_from_json_string(cls, json_string):
+        data = json.loads(json_string)
+        collation = cls.create_from_dict(data)
+        return collation
+    
+    @classmethod
+    def create_from_json_file(cls, json_path):
+        with open(json_path, 'r') as json_file:
+            data = json.load(json_file)
         collation = cls.create_from_dict(data)
         return collation
 

From 6842ca85cfd23e07a995158d63f62e9c7f4cf672 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:03:52 +0000
Subject: [PATCH 02/34] Create json-test1

---
 collatex-pythonport/use_cases/json-test1 | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 collatex-pythonport/use_cases/json-test1

diff --git a/collatex-pythonport/use_cases/json-test1 b/collatex-pythonport/use_cases/json-test1
new file mode 100644
index 000000000..a419c0e49
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test1
@@ -0,0 +1,29 @@
+{"witnesses" :
+    [
+        {"id" : "A","tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "quick"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "dog."}
+            ]
+        },
+        
+        {"id" : "B", "tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "lazy"},
+            {"t" : "dog."}
+            ]
+        }
+    ]
+}

From 5d12d0873230eddde4e5226a8e4f447010ea373d Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:04:47 +0000
Subject: [PATCH 03/34] Create json-test2.json

---
 collatex-pythonport/use_cases/json-test2.json | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 collatex-pythonport/use_cases/json-test2.json

diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json
new file mode 100644
index 000000000..97d91d424
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test2.json
@@ -0,0 +1,31 @@
+{"witnesses" :
+    [
+        {"id" : "E","tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "quick"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "dog"},
+            {"t" : "."}
+            ]
+        },
+        
+        {"id" : "F", "tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "lazy"},
+            {"t" : "dog"},
+            {"t" : "."}
+            ]
+        }
+    ]
+}

From aad18be8ece6b9567538e1cf3ccdaaa9ad809a9c Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:05:15 +0000
Subject: [PATCH 04/34] Delete json-test1

---
 collatex-pythonport/use_cases/json-test1 | 29 ------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 collatex-pythonport/use_cases/json-test1

diff --git a/collatex-pythonport/use_cases/json-test1 b/collatex-pythonport/use_cases/json-test1
deleted file mode 100644
index a419c0e49..000000000
--- a/collatex-pythonport/use_cases/json-test1
+++ /dev/null
@@ -1,29 +0,0 @@
-{"witnesses" :
-    [
-        {"id" : "A","tokens" :
-            [
-            {"t" : "The"},
-            {"t" : "quick"},
-            {"t" : "brown"},
-            {"t" : "fox"},
-            {"t" : "jumps"},
-            {"t" : "over"},
-            {"t" : "the"},
-            {"t" : "dog."}
-            ]
-        },
-        
-        {"id" : "B", "tokens" :
-            [
-            {"t" : "The"},
-            {"t" : "brown"},
-            {"t" : "fox"},
-            {"t" : "jumps"},
-            {"t" : "over"},
-            {"t" : "the"},
-            {"t" : "lazy"},
-            {"t" : "dog."}
-            ]
-        }
-    ]
-}

From bebc85692fbfe05d266fd1c7a2d470e2e5c0c216 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:05:45 +0000
Subject: [PATCH 05/34] Create json-test1.json

---
 collatex-pythonport/use_cases/json-test1.json | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 collatex-pythonport/use_cases/json-test1.json

diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json
new file mode 100644
index 000000000..6f4f00d38
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test1.json
@@ -0,0 +1,29 @@
+{"witnesses" :
+    [
+        {"id" : "E","tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "quick"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "dog."}
+            ]
+        },
+        
+        {"id" : "F", "tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "lazy"},
+            {"t" : "dog."}
+            ]
+        }
+    ]
+}

From f68f4cccde91d72a029fcdee23ab08d7d8f7f24f Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:08:16 +0000
Subject: [PATCH 06/34] Update json-test1.json

---
 collatex-pythonport/use_cases/json-test1.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json
index 6f4f00d38..a419c0e49 100644
--- a/collatex-pythonport/use_cases/json-test1.json
+++ b/collatex-pythonport/use_cases/json-test1.json
@@ -1,6 +1,6 @@
 {"witnesses" :
     [
-        {"id" : "E","tokens" :
+        {"id" : "A","tokens" :
             [
             {"t" : "The"},
             {"t" : "quick"},
@@ -13,7 +13,7 @@
             ]
         },
         
-        {"id" : "F", "tokens" :
+        {"id" : "B", "tokens" :
             [
             {"t" : "The"},
             {"t" : "brown"},

From b0fa0b469417e0cc46893b23cd3ffb19f0167047 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Mon, 9 Feb 2015 18:08:34 +0000
Subject: [PATCH 07/34] Update json-test2.json

---
 collatex-pythonport/use_cases/json-test2.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json
index 97d91d424..ca26f69af 100644
--- a/collatex-pythonport/use_cases/json-test2.json
+++ b/collatex-pythonport/use_cases/json-test2.json
@@ -1,6 +1,6 @@
 {"witnesses" :
     [
-        {"id" : "E","tokens" :
+        {"id" : "C","tokens" :
             [
             {"t" : "The"},
             {"t" : "quick"},
@@ -14,7 +14,7 @@
             ]
         },
         
-        {"id" : "F", "tokens" :
+        {"id" : "D", "tokens" :
             [
             {"t" : "The"},
             {"t" : "brown"},

From ed8c8e00a296ab0720ec183a5a0bc7df37c245ec Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 10 Feb 2015 00:07:37 +0000
Subject: [PATCH 08/34] Update core_functions.py

Update tokens property of Collation to reuse the witnesses' tokens instead of re-tokenizing the combined string.
---
 collatex-pythonport/collatex/core_functions.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 23d7bc026..43a7b96c1 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -179,8 +179,19 @@ def to_extended_suffix_array(self):
     def tokens(self):
         #print("COLLATION TOKENIZE IS CALLED!")
         #TODO: complete set of witnesses is retokenized here!
-        tokenizer = WordPunctuationTokenizer()
-        tokens = tokenizer.tokenize(self.get_combined_string())
+        #tokenizer = WordPunctuationTokenizer()
+        #tokens = tokenizer.tokenize(self.get_combined_string())
+        
+        #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens]
+        tokens = []
+        for i, witness in enumerate(self.witnesses):
+            for tk in witness._tokens:
+                tokens.append(tk.token_string)
+            # if last witness, do not append $ or i to the list of tokens
+            if i == len(self.witnesses)-1:
+                break
+            tokens.append('$')
+            tokens.append(str(i+1))
         return tokens
 
 

From babd89c04edd86e2a60cfcec129e63384216700e Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 10 Feb 2015 08:00:08 +0000
Subject: [PATCH 09/34] Update core_functions.py

---
 collatex-pythonport/collatex/core_functions.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 43a7b96c1..d73ad96d5 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -185,13 +185,11 @@ def tokens(self):
         #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens]
         tokens = []
         for i, witness in enumerate(self.witnesses):
+            if i > 0 :
+                tokens.append('$')
+                tokens.append(str(i))
             for tk in witness._tokens:
                 tokens.append(tk.token_string)
-            # if last witness, do not append $ or i to the list of tokens
-            if i == len(self.witnesses)-1:
-                break
-            tokens.append('$')
-            tokens.append(str(i+1))
         return tokens
 
 

From 5b5ab0483674f025fe047f7680dd364ca9a4e444 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 10 Feb 2015 17:47:36 +0000
Subject: [PATCH 10/34] Update core_functions.py

---
 collatex-pythonport/collatex/core_functions.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index d73ad96d5..b5bcc5333 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -126,11 +126,8 @@ def __init__(self):
         self.witnesses = []
         self.counter = 0
         self.witness_ranges = {}
-        self.combined_string = ""
         self.cached_suffix_array = None
 
-    # the tokenization process happens multiple times
-    # and by different tokenizers. This should be fixed
     def add_witness(self, witnessdata):
         # clear the suffix array and LCP array cache
         self.cached_suffix_array = None
@@ -141,9 +138,6 @@ def add_witness(self, witnessdata):
         # the extra one is for the marker token
         self.counter += len(witness.tokens()) +2 # $ + number 
         self.witness_ranges[witness.sigil] = witness_range
-        if not self.combined_string == "":
-            self.combined_string += " $"+str(len(self.witnesses)-1)+ " "
-        self.combined_string += witness.content
 
     def add_plain_witness(self, sigil, content):
         return self.add_witness({'id':sigil, 'content':content})
@@ -153,9 +147,6 @@ def get_range_for_witness(self, witness_sigil):
             raise Exception("Witness "+witness_sigil+" is not added to the collation!")
         return self.witness_ranges[witness_sigil]
 
-    def get_combined_string(self):
-        return self.combined_string
-
     def get_sa(self):
         #NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time
         if not self.cached_suffix_array:
@@ -171,24 +162,17 @@ def get_lcp_array(self):
         sa = self.get_sa()
         return sa._LCP_values
 
-
     def to_extended_suffix_array(self):
         return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array())
 
     @property
     def tokens(self):
-        #print("COLLATION TOKENIZE IS CALLED!")
-        #TODO: complete set of witnesses is retokenized here!
-        #tokenizer = WordPunctuationTokenizer()
-        #tokens = tokenizer.tokenize(self.get_combined_string())
-        
-        #tokens = [token.token_string for witness in self.witnesses for token in witness._tokens]
         tokens = []
         for i, witness in enumerate(self.witnesses):
             if i > 0 :
                 tokens.append('$')
                 tokens.append(str(i))
-            for tk in witness._tokens:
+            for tk in witness.tokens():
                 tokens.append(tk.token_string)
         return tokens
 

From 8bc6bec44ec5d236089e517bc57b82113a7648cb Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 10 Feb 2015 18:06:08 +0000
Subject: [PATCH 11/34] Update Witness class in core_classes.py

---
 collatex-pythonport/collatex/core_classes.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py
index 6d62f2a06..aee498dcf 100644
--- a/collatex-pythonport/collatex/core_classes.py
+++ b/collatex-pythonport/collatex/core_classes.py
@@ -164,17 +164,15 @@ def __init__(self, witnessdata):
         self.sigil = witnessdata['id']
         self._tokens = []
         if 'content' in witnessdata:
-            self.content = witnessdata['content']
-            # print("Witness "+sigil+" TOKENIZER IS CALLED!")
             tokenizer = WordPunctuationTokenizer()
-            tokens_as_strings = tokenizer.tokenize(self.content)
+            tokens_as_strings = tokenizer.tokenize(witnessdata['content'])
             for token_string in tokens_as_strings:
                 self._tokens.append(Token({'t':token_string}))
         elif 'tokens' in witnessdata:
             for tk in witnessdata['tokens']:
                 self._tokens.append(Token(tk))
-            # TODO no idea what this content string is needed for.
-            self.content = ' '.join([x.token_string for x in self._tokens])
+        #else raise an exception, if neither 'content' or 'tokens' in witnessdata? 
+        #also if no 'id' in witnessdata?
             
     def tokens(self):
         return self._tokens

From eaa7a883db676461091996e0ce127f68b09d9e5d Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 00:07:42 +0000
Subject: [PATCH 12/34] Update core_functions.py

Added a pretokenized keyword argument to the collate function, and created a new function get_tokenized_at.
Deleted the function collate_pretokenized_json.
Added the vertical layout option for json output as well.
---
 .../collatex/core_functions.py                | 67 +++++++------------
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index b5bcc5333..071e175fc 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -16,7 +16,7 @@
 # "table" for the alignment table (default)
 # "graph" for the variant graph
 # "json" for the alignment table exported as JSON
-def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False):
+def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False, pretokenized=False):
     algorithm = EditGraphAligner(collation, near_match=near_match, astar=astar, debug_scores=debug_scores)
     # build graph
     graph = VariantGraph()
@@ -27,10 +27,17 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
     # check which output format is requested: graph or table
     if output=="graph": 
         return graph
+    
     # create alignment table
     table = AlignmentTable(collation, graph, layout)
+    if pretokenized and not segmentation:
+        token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
+        #for the moment only with segmentation=False
+        #there could be a different comportment of get_tokenized_table if semgentation=True
+        table = get_tokenized_at(table, token_list, segmentation=segmentation)
+    
     if output == "json":
-        return export_alignment_table_as_json(table)
+        return export_alignment_table_as_json(table, layout=layout)
     if output == "html":
         return display_alignment_table_as_HTML(table)
     if output == "table":
@@ -38,48 +45,21 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
     else:
         raise Exception("Unknown output type: "+output)
     
-
-
-#TODO: this only works with a table output at the moment
-#TODO: store the tokens on the graph instead
-def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs):
-    # Takes more or less the same arguments as collate() above, but with some restrictions.
-    # Only output types 'json' and 'table' are supported.
-    if output not in ['json', 'table']:
-        raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation")
-    if 'segmentation' in kwargs and kwargs['segmentation']:
-        raise UnsupportedError("Segmented output not supported for pretokenized collation")
-    kwargs['segmentation'] = False
-
-    # For each witness given, make a 'shadow' witness based on the normalization tokens
-    # that will actually be collated.
-    tokenized_witnesses = []
-    collation = Collation()
-    for witness in json["witnesses"]:
-        collation.add_witness(witness)
-        tokenized_witnesses.append(witness["tokens"])
-    at = collate(collation, output="table", **kwargs)
-    tokenized_at = AlignmentTable(collation, layout=layout)
-    for row, tokenized_witness in zip(at.rows, tokenized_witnesses):
-        new_row = Row(row.header)
+def get_tokenized_at(table, token_list, segmentation=False):
+    tokenized_at = AlignmentTable(Collation())
+    for witness_row, witness_tokens in zip(table.rows, token_list):
+        new_row = Row(witness_row.header)
         tokenized_at.rows.append(new_row)
-        token_counter = 0
-        for cell in row.cells:
+        counter = 0
+        for cell in witness_row.cells:
             if cell != "-":
-                new_row.cells.append(tokenized_witness[token_counter])
-                token_counter+=1
-            else:
-                #TODO: should probably be null or None instead, but that would break the rendering at the moment 
-                new_row.cells.append({"t":"-"})
-    if output=="json":
-        return export_alignment_table_as_json(tokenized_at)
-    if output=="table":
-        # transform JSON objects to "t" form.
-        for row in tokenized_at.rows:
-            row.cells = [cell["t"]  for cell in row.cells]
-        return tokenized_at
-
-def export_alignment_table_as_json(table, indent=None, status=False):
+                new_row.cells.append(witness_tokens[counter])
+                counter+=1
+            else: 
+                new_row.cells.append({})
+    return tokenized_at
+
+def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):
     json_output = {}
     json_output["table"]=[]
     sigli = []
@@ -92,6 +72,9 @@ def export_alignment_table_as_json(table, indent=None, status=False):
         for column in table.columns:
             variant_status.append(column.variant)
         json_output["status"]=variant_status
+    if layout=="vertical":
+        new_table = [[row[i] for row in json_output["table"]] for i in range(len(row.cells))]
+        json_output["table"] = new_table
     return json.dumps(json_output, sort_keys=True, indent=indent)
 
 '''

From 6672e5288fdbf94dd646eb0ab465829b0e2fb194 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 00:08:45 +0000
Subject: [PATCH 13/34] Update __init__.py

---
 collatex-pythonport/collatex/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/collatex-pythonport/collatex/__init__.py b/collatex-pythonport/collatex/__init__.py
index 72cd2d045..a277707f2 100755
--- a/collatex-pythonport/collatex/__init__.py
+++ b/collatex-pythonport/collatex/__init__.py
@@ -7,8 +7,7 @@
 
 from collatex.core_functions import Collation
 from collatex.core_functions import collate
-from collatex.core_functions import collate_pretokenized_json
 
-__all__ = ["Collation", "collate", "collate_pretokenized_json"]
+__all__ = ["Collation", "collate"]
 
 

From 070f8256e21276046c58a1b018757932169ccf4e Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 10:55:49 +0000
Subject: [PATCH 14/34] Update core_functions.py

---
 .../collatex/core_functions.py                | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 071e175fc..354ce3375 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -35,6 +35,10 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
         #for the moment only with segmentation=False
         #there could be a different comportment of get_tokenized_table if semgentation=True
         table = get_tokenized_at(table, token_list, segmentation=segmentation)
+        # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict)
+        if output=="table" or output=="html":
+            for row in table.rows:
+                row.cells = [cell["t"] for cell in row.cells]
     
     if output == "json":
         return export_alignment_table_as_json(table, layout=layout)
@@ -52,11 +56,22 @@ def get_tokenized_at(table, token_list, segmentation=False):
         tokenized_at.rows.append(new_row)
         counter = 0
         for cell in witness_row.cells:
-            if cell != "-":
+            if cell == "-":
+                # TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41)
+                new_row.cells.append({"t" : "-"})
+            # if segmentation=False    
+            else: 
                 new_row.cells.append(witness_tokens[counter])
                 counter+=1
-            else: 
-                new_row.cells.append({})
+            # else if segmentation=True
+                #string = witness_tokens[counter].token_string
+                #token_counter = 1
+                #while string != cell:
+                    ##add token_string of the next token until it is equivalent to the string in the cell
+                    #string += next token string
+                    #token_counter += 1
+                #new_row.cells.append([tk for tk in witness_tokens[counter:counter+token_counter]])
+                #update counter (counter += token_counter)
     return tokenized_at
 
 def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):

From 42fb5ebf7c240dd081a9f3ef998652a15f3969a3 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 15:27:57 +0000
Subject: [PATCH 15/34] Update core_functions.py

Replaced Collation.tokens property with an attribute combined_tokens.
---
 .../collatex/core_functions.py                | 26 +++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 354ce3375..212ba9b31 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -32,8 +32,8 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
     table = AlignmentTable(collation, graph, layout)
     if pretokenized and not segmentation:
         token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
-        #for the moment only with segmentation=False
-        #there could be a different comportment of get_tokenized_table if semgentation=True
+        # only with segmentation=False
+        # there could be a different comportment of get_tokenized_table if semgentation=True
         table = get_tokenized_at(table, token_list, segmentation=segmentation)
         # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict)
         if output=="table" or output=="html":
@@ -125,6 +125,7 @@ def __init__(self):
         self.counter = 0
         self.witness_ranges = {}
         self.cached_suffix_array = None
+        self.combined_tokens =[]
 
     def add_witness(self, witnessdata):
         # clear the suffix array and LCP array cache
@@ -136,6 +137,11 @@ def add_witness(self, witnessdata):
         # the extra one is for the marker token
         self.counter += len(witness.tokens()) +2 # $ + number 
         self.witness_ranges[witness.sigil] = witness_range
+        if len(self.witnesses) > 1:
+            self.combined_tokens.append('$')
+            self.combined_tokens.append(str(len(self.witnesses)-1))
+        for tk in witness.tokens():
+            self.combined_tokens.append(tk.token_string)
 
     def add_plain_witness(self, sigil, content):
         return self.add_witness({'id':sigil, 'content':content})
@@ -149,7 +155,7 @@ def get_sa(self):
         #NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time
         if not self.cached_suffix_array:
             # Unit byte is done to skip tokenization in third party library
-            self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE)
+            self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE)
         return self.cached_suffix_array
 
     def get_suffix_array(self):
@@ -161,18 +167,6 @@ def get_lcp_array(self):
         return sa._LCP_values
 
     def to_extended_suffix_array(self):
-        return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array())
-
-    @property
-    def tokens(self):
-        tokens = []
-        for i, witness in enumerate(self.witnesses):
-            if i > 0 :
-                tokens.append('$')
-                tokens.append(str(i))
-            for tk in witness.tokens():
-                tokens.append(tk.token_string)
-        return tokens
-
+        return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array())
 
 

From 3392a422967130b9d2e06f89f8e9075fa702fbe8 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 15:29:34 +0000
Subject: [PATCH 16/34] Update suffix_based_scorer.py

Replaced Collation.tokens property with combined_tokens attribute.
---
 collatex-pythonport/collatex/suffix_based_scorer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collatex-pythonport/collatex/suffix_based_scorer.py b/collatex-pythonport/collatex/suffix_based_scorer.py
index cfb2713f2..27c8b468e 100644
--- a/collatex-pythonport/collatex/suffix_based_scorer.py
+++ b/collatex-pythonport/collatex/suffix_based_scorer.py
@@ -147,7 +147,7 @@ def _get_block_witness(self, witness):
                 occurrences.append(occurrence) 
         # sort occurrences on position
         sorted_o = sorted(occurrences, key=attrgetter('lower_end'))
-        block_witness = BlockWitness(sorted_o, self.collation.tokens)
+        block_witness = BlockWitness(sorted_o, self.collation.combined_tokens)
         return block_witness
 
     '''

From 58a3a5e93143f38e08084dd9ec0c1b2fbefb0fcf Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 15:59:34 +0000
Subject: [PATCH 17/34] Update core_functions.py

Added auto-detection of pretokenized json.
---
 collatex-pythonport/collatex/core_functions.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 212ba9b31..5b66988ce 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -16,7 +16,7 @@
 # "table" for the alignment table (default)
 # "graph" for the variant graph
 # "json" for the alignment table exported as JSON
-def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False, pretokenized=False):
+def collate(collation, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False, debug_scores=False):
     algorithm = EditGraphAligner(collation, near_match=near_match, astar=astar, debug_scores=debug_scores)
     # build graph
     graph = VariantGraph()
@@ -30,7 +30,7 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
     
     # create alignment table
     table = AlignmentTable(collation, graph, layout)
-    if pretokenized and not segmentation:
+    if collation.pretokenized and not segmentation:
         token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
         # only with segmentation=False
         # there could be a different comportment of get_tokenized_table if semgentation=True
@@ -99,8 +99,13 @@ class Collation(object):
 
     @classmethod
     def create_from_dict(cls, data, limit=None):
+        if "witnesses" not in data:
+            raise UnsupportedError("Json input not valid")
         witnesses = data["witnesses"]
         collation = Collation()
+        # determine if data is pretokenized (check for the first witness)
+        if 'tokens' in witnesses[0]:
+            collation.pretokenized = True
         for witness in witnesses[:limit]:
             # generate collation object from json_data
             collation.add_witness(witness)
@@ -122,6 +127,7 @@ def create_from_json_file(cls, json_path):
 
     def __init__(self):
         self.witnesses = []
+        self.pretokenized = False
         self.counter = 0
         self.witness_ranges = {}
         self.cached_suffix_array = None

From b5e2bad1340b36db7f0b407c6e46ffae8ec68075 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 16:14:58 +0000
Subject: [PATCH 18/34] Update core_classes.py

Added exception handling in Witness class.
---
 collatex-pythonport/collatex/core_classes.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py
index aee498dcf..882af47e4 100644
--- a/collatex-pythonport/collatex/core_classes.py
+++ b/collatex-pythonport/collatex/core_classes.py
@@ -161,6 +161,8 @@ def __repr__(self):
 class Witness(object):
     
     def __init__(self, witnessdata):
+        if 'id' not in witnessdata:
+            raise UnsupportedError("No defined id in witnessdata")
         self.sigil = witnessdata['id']
         self._tokens = []
         if 'content' in witnessdata:
@@ -171,8 +173,8 @@ def __init__(self, witnessdata):
         elif 'tokens' in witnessdata:
             for tk in witnessdata['tokens']:
                 self._tokens.append(Token(tk))
-        #else raise an exception, if neither 'content' or 'tokens' in witnessdata? 
-        #also if no 'id' in witnessdata?
+        else:
+            raise UnsupportedError("No defined content/tokens in witness "+self.sigil)
             
     def tokens(self):
         return self._tokens

From 7fc92ebf0d8df16cb4fe35286759056de0421aeb Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 16:32:37 +0000
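Subject: [PATCH 19/34] Create json-test3.json

Add a use-case file with two pretokenized witnesses (E and F). Some of
E's tokens carry "id" and "n" properties in addition to "t".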
---
 collatex-pythonport/use_cases/json-test3.json | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 collatex-pythonport/use_cases/json-test3.json

diff --git a/collatex-pythonport/use_cases/json-test3.json b/collatex-pythonport/use_cases/json-test3.json
new file mode 100644
index 000000000..dcbee333e
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test3.json
@@ -0,0 +1,29 @@
+{"witnesses" :
+    [
+        {"id" : "E","tokens" :
+            [
+            {"t" : "The", "id": 1, "n": "the"},
+            {"t" : "quick", "id": 2},
+            {"t" : "brown", "id": 3},
+            {"t" : "fox", "id": 4},
+            {"t" : "jumps", "id": 5},
+            {"t" : "over", "id": 6},
+            {"t" : "the", "id": 7},
+            {"t" : "dog.", "id": 8, "n": "dog"}
+            ]
+        },
+        
+        {"id" : "F", "tokens" :
+            [
+            {"t" : "The"},
+            {"t" : "brown"},
+            {"t" : "fox"},
+            {"t" : "jumps"},
+            {"t" : "over"},
+            {"t" : "the"},
+            {"t" : "lazy"},
+            {"t" : "dog."}
+            ]
+        }
+    ]
+}

From b7beb23639f8ec49693437dda1074e79da159836 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 11 Feb 2015 16:47:12 +0000
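Subject: [PATCH 20/34] Update core_functions.py

Replace the commented-out segmentation pseudo-code in get_tokenized_at
with a short placeholder comment.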
---
 collatex-pythonport/collatex/core_functions.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 5b66988ce..69b010c70 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -64,14 +64,7 @@ def get_tokenized_at(table, token_list, segmentation=False):
                 new_row.cells.append(witness_tokens[counter])
                 counter+=1
             # else if segmentation=True
-                #string = witness_tokens[counter].token_string
-                #token_counter = 1
-                #while string != cell:
-                    ##add token_string of the next token until it is equivalent to the string in the cell
-                    #string += next token string
-                    #token_counter += 1
-                #new_row.cells.append([tk for tk in witness_tokens[counter:counter+token_counter]])
-                #update counter (counter += token_counter)
+                # do something else...
     return tokenized_at
 
 def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):

From a5bc15b3173f40f167c644c410f1971495141e20 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Thu, 12 Feb 2015 10:42:40 +0000
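Subject: [PATCH 21/34] Update core_functions.py

In Collation.create_from_dict, set collation.pretokenized inside the
witness loop for any added witness that has 'tokens', instead of
checking only the first witness.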
---
 collatex-pythonport/collatex/core_functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 69b010c70..7875cf271 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -96,12 +96,12 @@ def create_from_dict(cls, data, limit=None):
             raise UnsupportedError("Json input not valid")
         witnesses = data["witnesses"]
         collation = Collation()
-        # determine if data is pretokenized (check for the first witness)
-        if 'tokens' in witnesses[0]:
-            collation.pretokenized = True
         for witness in witnesses[:limit]:
             # generate collation object from json_data
             collation.add_witness(witness)
+            # determine if data is pretokenized
+            if 'tokens' in witness:
+                collation.pretokenized = True
         return collation
 
     # json input can be a string or a file

From 42de8b1e3d9b65cc43fda22ce866feabe6ca5873 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Thu, 12 Feb 2015 17:10:39 +0000
Subject: [PATCH 22/34] Update core_functions.py

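Pass the layout argument through to get_tokenized_at so that the
tokenized alignment table is created with the requested layout.
Add a commented-out attempt at building the tokenized alignment table
when tokens are joined into segments (segmentation=True).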
---
 .../collatex/core_functions.py                | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index 7875cf271..318e0437f 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -34,7 +34,7 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
         token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
         # only with segmentation=False
         # there could be a different comportment of get_tokenized_table if semgentation=True
-        table = get_tokenized_at(table, token_list, segmentation=segmentation)
+        table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout)
         # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict)
         if output=="table" or output=="html":
             for row in table.rows:
@@ -49,8 +49,8 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
     else:
         raise Exception("Unknown output type: "+output)
     
-def get_tokenized_at(table, token_list, segmentation=False):
-    tokenized_at = AlignmentTable(Collation())
+def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"):
+    tokenized_at = AlignmentTable(Collation(), layout=layout)
     for witness_row, witness_tokens in zip(table.rows, token_list):
         new_row = Row(witness_row.header)
         tokenized_at.rows.append(new_row)
@@ -64,7 +64,19 @@ def get_tokenized_at(table, token_list, segmentation=False):
                 new_row.cells.append(witness_tokens[counter])
                 counter+=1
             # else if segmentation=True
-                # do something else...
+                ##token_list must be a list of Token instead of list of dict (update lines 34, 64)
+                ##line 41 will not be happy in case of table/html output
+                #string = witness_tokens[counter].token_string
+                #token_counter = 1
+                #while string != cell :
+                #    if counter+token_counter-1 < len(witness_tokens)-1:
+                #        #add token_string of the next token until it is equivalent to the string in the cell
+                #        #if we are not at the last token
+                #        string += ' '+witness_tokens[counter+token_counter].token_string
+                #        token_counter += 1
+                ##there is one list level too many in the output
+                #new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]])
+                #counter += token_counter.
     return tokenized_at
 
 def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):

From 9aeaebbd744632f8fad63469e6965c945a3164da Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 11:43:23 +0000
Subject: [PATCH 23/34] Create test_token_class.py

---
 collatex-pythonport/tests/test_token_class.py | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 collatex-pythonport/tests/test_token_class.py

diff --git a/collatex-pythonport/tests/test_token_class.py b/collatex-pythonport/tests/test_token_class.py
new file mode 100644
index 000000000..fe4088ad0
--- /dev/null
+++ b/collatex-pythonport/tests/test_token_class.py
@@ -0,0 +1,39 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_classes import Token
+from collatex.exceptions import TokenError
+
+
+class TestToken(unittest.TestCase):
+
+    def test_creation_token_t(self):
+        data = {'t': 'fox', 'id': 123 }
+        t = Token(data)
+        self.assertEqual(t.token_string, 'fox')
+        self.assertEqual(t.token_data, data)
+        
+    def test_creation_token_n(self):
+        data = {'t': 'kitten', 'n': 'cat'}
+        t = Token(data)
+        self.assertEqual(t.token_string, 'cat')
+        self.assertEqual(t.token_data, data)
+    
+    def test_creation_token_none(self):
+        t = Token(None)
+        self.assertEqual(t.token_string, '')
+        self.assertIsNone(t.token_data)
+        
+    def test_invalid_token_raises_exception(self):
+        with self.assertRaises(TokenError):
+            #data = {'x': 'abc'}
+            data = {}
+            Token(data)
+
+if __name__ == '__main__':
+    unittest.main()
+    

From 13af6ddcc3a9a5430e315cfee1a048e3e43dfe9e Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 11:44:46 +0000
Subject: [PATCH 24/34] Create test_witness_class.py

---
 .../tests/test_witness_class.py               | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 collatex-pythonport/tests/test_witness_class.py

diff --git a/collatex-pythonport/tests/test_witness_class.py b/collatex-pythonport/tests/test_witness_class.py
new file mode 100644
index 000000000..8f2e6e33b
--- /dev/null
+++ b/collatex-pythonport/tests/test_witness_class.py
@@ -0,0 +1,54 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_classes import Witness, Token, Tokenizer
+from collatex.exceptions import UnsupportedError, TokenError
+
+class TestWitness(unittest.TestCase):
+
+    def test_creation_witness_plain(self):
+        data = {'id': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'}
+        w = Witness(data)
+        self.assertEqual(w.sigil, 'A')
+        self.assertEqual(len(w.tokens()), 10)
+        self.assertEqual(w.tokens()[3].token_string, 'fox')
+        
+    def test_creation_witness_pretokenized(self):
+        data = {    'id': 'B',
+                    'tokens': [
+                        {'t': 'A', 'ref': 123},
+                        {'t': 'black and blue', 'adj': True},
+                        {'t': 'cat', 'id': 'xyz'},
+                        {'t': 'bird.', 'id': 'abc'}
+                    ]
+                }
+        w = Witness(data)
+        self.assertEqual(w.sigil, 'B')
+        self.assertEqual(len(w.tokens()), 4)
+    
+    def test_invalid_witness_missing_id(self):
+        data = {'name': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'}
+        self.assertRaises(UnsupportedError, Witness, data)
+        
+    def test_invalid_witness_missing_content_tokens(self):
+        data = {'id': 'A'}
+        self.assertRaises(UnsupportedError, Witness, data)
+   
+    def test_invalid_witness_content_is_pretokenized(self):
+        #'content' is pretokenized instead of plain text
+        data = {'id': 'A', 'content': [{'t':'the'}, {'t':'fox'}]}
+        self.assertRaises(TypeError, Witness, data)  
+    
+    def test_invalid_witness_tokens_is_plain(self):
+        #'tokens' is plain text instead of pretokenized        
+        data = {'id': 'A', 'tokens': 'The quick brown fox jumped over the lazy dogs.'}    
+        self.assertRaises(TokenError, Witness, data) 
+
+
+if __name__ == '__main__':
+    unittest.main()
+    

From 87675aa504705267d658be023d39969e0672ae67 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 11:46:44 +0000
Subject: [PATCH 25/34] Create test_collation_class.py

---
 .../tests/test_collation_class.py             | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 collatex-pythonport/tests/test_collation_class.py

diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py
new file mode 100644
index 000000000..a97fb684e
--- /dev/null
+++ b/collatex-pythonport/tests/test_collation_class.py
@@ -0,0 +1,84 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_functions import *
+from collatex.exceptions import UnsupportedError
+from testfixtures import TempDirectory
+import os
+import json
+
+class TestCollationMethods(unittest.TestCase):
+    
+    def test_collation_method_create_from_json_file(self):
+        with TempDirectory() as d:
+            #create a temporary file in a temporary directory
+            d.write('testfile.json', b'{"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}')
+            c = Collation.create_from_json_file(os.path.join(d.path, 'testfile.json'))
+            self.assertEqual(len(c.witnesses), 2)
+    
+    def test_collation_create_from_dict(self):
+        data = {"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}
+        c = Collation.create_from_dict(data)
+        self.assertEqual(len(c.witnesses), 2)
+  
+
+class TestCollationFunctions(unittest.TestCase):
+    def setUp(self):
+        data = {
+            'witnesses' : [
+                {
+                    'id' : 'A',
+                    'content' : 'The cat'
+                },
+                {
+                    'id' : 'B',
+                    'tokens' : [
+                        { 't' : 'The'},
+                        { 't' : 'kitten'}
+                    ]
+                }
+            ]
+        }
+        self.c = Collation.create_from_dict(data)
+    
+    def test_collation_function_add_plain_witness(self):
+        self.c.add_plain_witness('C', 'A cat')
+        self.assertEqual(len(self.c.witnesses), 3)
+    
+    def test_collation_function_add_witness(self):
+        witnessdata = {'id': 'C', 'tokens': [{ 't' : 'A'},{ 't' : 'cat'}]}
+        self.c.add_witness(witnessdata)
+        self.assertEqual(len(self.c.witnesses), 3)
+    
+    def test_collation_function_add_witnesses_with_same_id(self):
+        witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]}
+        witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]}
+        self.c.add_witness(witnessdata1)
+        self.c.add_witness(witnessdata2)
+        self.assertEqual(len(self.c.witnesses), 4)
+        
+        #error in the collation result => there should be an exception raised...
+        #json_result = json.loads(collate(self.c, output='json'))
+        #self.assertEqual(json_result['table'][2][1], 'fox')
+        #self.assertEqual(json_result['table'][3][1], 'dog')
+        self.fail("It should not be possible to add 2 witnesses with the same id")
+    
+    def test_collation_function_get_range_for_witness(self):
+        expected_range_B = RangeSet()
+        expected_range_B.add_range(4, 6)
+        self.assertEqual(self.c.get_range_for_witness('B'), expected_range_B)
+        self.assertRaises(Exception, self.c.get_range_for_witness, 'W')
+    
+    #test other functions?
+    #get suffix array
+    #get sa
+    #get lcp array
+    #to extended suffix array
+
+
+if __name__ == '__main__':
+    unittest.main()

From 8b9b0ece18c7c46c672dc59fb73614aeaa6012b0 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 11:48:15 +0000
Subject: [PATCH 26/34] Create test_collate_outputs.py

---
 .../tests/test_collate_outputs.py             | 275 ++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 collatex-pythonport/tests/test_collate_outputs.py

diff --git a/collatex-pythonport/tests/test_collate_outputs.py b/collatex-pythonport/tests/test_collate_outputs.py
new file mode 100644
index 000000000..cb9c8e5dd
--- /dev/null
+++ b/collatex-pythonport/tests/test_collate_outputs.py
@@ -0,0 +1,275 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_functions import *
+from collatex.exceptions import UnsupportedError
+
+class TestCollate(unittest.TestCase):
+    def test_collate_with_invalid_output(self):
+        data = {"witnesses" :
+            [
+            {"id" : "A", "tokens" :
+                [
+                {"t": "A", "id": 1},
+                {"t": "small"},
+                {"t": "black"},
+                {"t": "cat"}
+                ]
+            },
+            {"id" : "B", "tokens" :
+                [
+                {"t": "A"},
+                {"t": "small"},
+                {"t": "white"},
+                {"t": "kitten.", "n": "cat"}
+                ]
+            }
+            ]
+        }
+        c = Collation.create_from_dict(data)
+        with self.assertRaises(Exception):
+            collate(c, output="xyz")
+    
+    def test_collate_with_empty_collation(self):
+        c = Collation()
+        with self.assertRaises(IndexError):
+            collate(c)
+  
+
+class TestTokenizedJsonOutput(unittest.TestCase):
+    def setUp(self):
+        self.data = {"witnesses" :
+            [
+            {"id" : "A", "tokens" :
+                [
+                {"t": "A", "id": 1},
+                {"t": "small"},
+                {"t": "black"},
+                {"t": "cat"}
+                ]
+            },
+            {"id" : "B", "tokens" :
+                [
+                {"t": "A"},
+                {"t": "small"},
+                {"t": "white"},
+                {"t": "kitten.", "n": "cat"}
+                ]
+            }
+            ]
+        }
+        self.c = Collation.create_from_dict(self.data)
+        self.maxDiff = None
+    
+    #--------------------------------------------------
+    #JSON output
+    def test_tokenized_output_json_segmentationFalse_layoutHorizontal(self):
+        expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "small"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "small"}], [{"t": "white"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json", segmentation=False, layout="horizontal")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_json_segmentationFalse_layoutVertical(self):
+        expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "A"}]], [[{"t": "small"}], [{"t": "small"}]], [[{"t": "black"}], [{"t": "white"}]], [[{"t": "cat"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json", segmentation=False, layout="vertical")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_json_segmentationTrue_layoutHorizontal(self):
+        expected = '{"table": [[["A small"], ["black"], ["cat"]], [["A small"], ["white"], ["cat"]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json", segmentation=True, layout="horizontal")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_json_segmentationTrue_layoutVertical(self):
+        expected = '{"table": [[["A small"], ["A small"]], [["black"], ["white"]], [["cat"], ["cat"]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json", segmentation=True, layout="vertical")
+        self.assertEqual(output, expected)
+    
+    #--------------------------------------------------
+    #TABLE output
+
+    def test_tokenized_output_table_segmentationFalse_layoutHorizontal(self):
+        expected = """\
++---+---+-------+-------+---------+
+| A | A | small | black | cat     |
+| B | A | small | white | kitten. |
++---+---+-------+-------+---------+"""
+        output = str(collate(self.c, output="table", segmentation=False, layout="horizontal"))
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_table_segmentationFalse_layoutVertical(self):
+        expected = '''\
++-------+---------+
+|   A   |    B    |
++-------+---------+
+|   A   |    A    |
++-------+---------+
+| small |  small  |
++-------+---------+
+| black |  white  |
++-------+---------+
+|  cat  | kitten. |
++-------+---------+'''
+        output = str(collate(self.c, output="table", segmentation=False, layout="vertical"))
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_table_segmentationTrue_layoutHorizontal(self):
+        expected = """\
++---+---------+-------+-----+
+| A | A small | black | cat |
+| B | A small | white | cat |
++---+---------+-------+-----+"""
+        output = str(collate(self.c, output="table", segmentation=True, layout="horizontal"))
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_table_segmentationTrue_layoutVertical(self):
+        expected = '''\
++---------+---------+
+|    A    |    B    |
++---------+---------+
+| A small | A small |
++---------+---------+
+|  black  |  white  |
++---------+---------+
+|   cat   |   cat   |
++---------+---------+'''
+        output = str(collate(self.c, output="table", segmentation=True, layout="vertical"))
+        self.assertEqual(output, expected)
+   
+    #--------------------------------------------------
+    #HTML output
+
+    def test_tokenized_output_html_segmentationFalse_layoutHorizontal(self):
+        expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>A</td>
+        <td>small</td>
+        <td>black</td>
+        <td>cat</td>
+    </tr>
+    <tr>
+        <td>B</td>
+        <td>A</td>
+        <td>small</td>
+        <td>white</td>
+        <td>kitten.</td>
+    </tr>
+</table>'''
+        output = collate(self.c, output="html", segmentation=False, layout="horizontal")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_html_segmentationFalse_layoutVertical(self):
+        expected = '''\
+<table>
+    <tr>
+        <th>A</th>
+        <th>B</th>
+    </tr>
+    <tr>
+        <td>A</td>
+        <td>A</td>
+    </tr>
+    <tr>
+        <td>small</td>
+        <td>small</td>
+    </tr>
+    <tr>
+        <td>black</td>
+        <td>white</td>
+    </tr>
+    <tr>
+        <td>cat</td>
+        <td>kitten.</td>
+    </tr>
+</table>'''
+        output = collate(self.c, output="html", segmentation=False, layout="vertical")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_html_segmentationTrue_layoutHorizontal(self):
+        expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>A small</td>
+        <td>black</td>
+        <td>cat</td>
+    </tr>
+    <tr>
+        <td>B</td>
+        <td>A small</td>
+        <td>white</td>
+        <td>cat</td>
+    </tr>
+</table>'''
+        output = collate(self.c, output="html", segmentation=True, layout="horizontal")
+        self.assertEqual(output, expected)
+    
+    def test_tokenized_output_html_segmentationTrue_layoutVertical(self):
+        expected = '''\
+<table>
+    <tr>
+        <th>A</th>
+        <th>B</th>
+    </tr>
+    <tr>
+        <td>A small</td>
+        <td>A small</td>
+    </tr>
+    <tr>
+        <td>black</td>
+        <td>white</td>
+    </tr>
+    <tr>
+        <td>cat</td>
+        <td>cat</td>
+    </tr>
+</table>'''
+        output = collate(self.c, output="html", segmentation=True, layout="vertical")
+        self.assertEqual(output, expected)
+
+    
+    
+    
+#--------------------------------------------------
+#Empty cells output
+
+class TestOutputEmptyCells(unittest.TestCase):
+    def setUp(self):
+        data = {
+      "witnesses" : [
+        {
+          "id" : "A",
+          "tokens" : [
+              { "t" : "A"},
+              { "t" : "black"},
+              { "t" : "cat"}
+          ]
+        },
+        {
+          "id" : "B",
+          "tokens" : [
+              { "t": "A" },
+              { "t": "kitten.", "n": "cat" }
+          ]
+        }
+    ]
+    }
+        self.c = Collation.create_from_dict(data)
+    
+    def test_json_segmentationTrue_output_with_empty_cells(self):
+        expected = '{"table": [[["A"], ["black"], ["cat"]], [["A"], ["-"], ["cat"]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json")
+        self.assertEqual(output, expected)
+    
+    def test_json_segmentationFalse_output_with_empty_cells(self):
+        expected = '{"table": [[[{"t": "A"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "-"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+        output = collate(self.c, output="json", segmentation=False)
+        self.assertEqual(output, expected)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 42546cf2c5b0dab0ff0c8f468ea59e1311e44281 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 11:49:34 +0000
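Subject: [PATCH 27/34] Update core_classes.py

Import UnsupportedError, which Witness.__init__ raises when the witness
data has no 'id' or has neither 'content' nor 'tokens'.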
---
 collatex-pythonport/collatex/core_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py
index 882af47e4..9f1d21d92 100644
--- a/collatex-pythonport/collatex/core_classes.py
+++ b/collatex-pythonport/collatex/core_classes.py
@@ -13,7 +13,7 @@
 import re
 from prettytable import PrettyTable
 from textwrap import fill
-from collatex.exceptions import TokenError
+from collatex.exceptions import TokenError, UnsupportedError
 
 class Row(object):
     

From c2923d5859c5137b2b10d355f32bf757cecf9586 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 12:38:51 +0000
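Subject: [PATCH 28/34] Update test_collatex_block_witnesses.py

In test_combined_string_hermans_case, compare the space-joined
collation.combined_tokens (witness separator now "$ 1") instead of
collation.get_combined_string(). Add the missing newline at end of file.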
---
 collatex-pythonport/tests/test_collatex_block_witnesses.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py
index 7cb412822..48e78361e 100644
--- a/collatex-pythonport/tests/test_collatex_block_witnesses.py
+++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py
@@ -29,7 +29,7 @@ def test_combined_string_hermans_case(self):
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
         collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
         # $ is meant to separate witnesses here
-        self.assertEquals("a b c d F g h i ! K ! q r s t $1 a b c d F g h i ! q r s t", collation.get_combined_string())
+        self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens))
     
     # test whether the witness->range mapping works
     @unit_disabled
@@ -241,4 +241,4 @@ def test_filter_potential_blocks(self):
 
 if __name__ == "__main__":
     #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From 2914b6b2407452aff6f74624251f35939c826601 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 12:44:25 +0000
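Subject: [PATCH 29/34] Update test_near_matching_pretokenized.py

Build the fixture in setUp with Collation.create_from_dict and call
collate() instead of collate_pretokenized_json. Replace assertEquals
with assertEqual.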
---
 .../tests/test_near_matching_pretokenized.py  | 82 ++++++++++---------
 1 file changed, 42 insertions(+), 40 deletions(-)

diff --git a/collatex-pythonport/tests/test_near_matching_pretokenized.py b/collatex-pythonport/tests/test_near_matching_pretokenized.py
index cad73a67e..7beb7f043 100644
--- a/collatex-pythonport/tests/test_near_matching_pretokenized.py
+++ b/collatex-pythonport/tests/test_near_matching_pretokenized.py
@@ -5,61 +5,63 @@
 '''
 import unittest
 from tests import unit_disabled
-from collatex.core_functions import collate_pretokenized_json
+from collatex.core_functions import *
 
 
 class Test(unittest.TestCase):
-    json_in = {
-      "witnesses" : [
-        {
-          "id" : "A",
-          "tokens" : [
-              { "t" : "I", "ref" : 123 },
-              { "t" : "bought" , "adj" : True },
-              { "t" : "this", "id" : "x3" },
-              { "t" : "glass", "id" : "x4" },
-              { "t" : ",", "type" : "punct" },
-              { "t" : "because", "id" : "x5" },
-              { "t" : "it", "id" : "x6" },
-              { "t" : "matches" },
-              { "t" : "those", "id" : "x7" },
-              { "t" : "dinner", "id" : "x8" },
-              { "t" : "plates", "id" : "x9" },
-              { "t" : ".", "type" : "punct" }
-          ]
-        },
-        {
-          "id" : "B",
-          "tokens" : [
-              { "t" : "I" },
-              { "t" : "bought" , "adj" : True },
-              { "t" : "those", "id" : "abc" },
-              { "t" : "glasses", "id" : "xyz" },
-              { "t" : ".", "type" : "punct" }
-          ]
+    def setUp(self):
+        json_in = {
+        "witnesses" : [
+            {
+              "id" : "A",
+              "tokens" : [
+                  { "t" : "I", "ref" : 123 },
+                  { "t" : "bought" , "adj" : True },
+                  { "t" : "this", "id" : "x3" },
+                  { "t" : "glass", "id" : "x4" },
+                  { "t" : ",", "type" : "punct" },
+                  { "t" : "because", "id" : "x5" },
+                  { "t" : "it", "id" : "x6" },
+                  { "t" : "matches" },
+                  { "t" : "those", "id" : "x7" },
+                  { "t" : "dinner", "id" : "x8" },
+                  { "t" : "plates", "id" : "x9" },
+                  { "t" : ".", "type" : "punct" }
+              ]
+            },
+            {
+            "id" : "B",
+            "tokens" : [
+                  { "t" : "I" },
+                  { "t" : "bought" , "adj" : True },
+                  { "t" : "those", "id" : "abc" },
+                  { "t" : "glasses", "id" : "xyz" },
+                  { "t" : ".", "type" : "punct" }
+              ]
+            }
+            ]
         }
-      ]
-    }
+        self.c = Collation.create_from_dict(json_in)
 
     def test_exact_matching(self):
-        result = collate_pretokenized_json(self.json_in)
-        self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
+        result = collate(self.c, segmentation=False)
+        self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
                           result.rows[0].to_list())
-        self.assertEquals(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list())
+        self.assertEqual(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list())
 
     def test_near_matching(self):
-        result = collate_pretokenized_json(self.json_in, near_match=True)
-        self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
+        result = collate(self.c, segmentation=False, near_match=True)
+        self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
                           result.rows[0].to_list())
-        self.assertEquals(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list())
+        self.assertEqual(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list())
 
     # Re-enable this one if segmented output is ever supported on tokenized collation
     @unit_disabled
     def test_near_matching_segmented(self):
-        result = collate_pretokenized_json(self.json_in, near_match=True, segmentation=True)
-        self.assertEquals(["I bought", "this glass, because it matches those dinner plates."],
+        result = collate(self.c, near_match=True, segmentation=True)
+        self.assertEqual(["I bought", "this glass, because it matches those dinner plates."],
                           result.rows[0].to_list())
-        self.assertEquals(["I bought", "those glasses."], result.rows[1].to_list())
+        self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list())
 
 
 if __name__ == "__main__":

From 977b0fa97c598061ddb48d3578c0f8c78f6c751d Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 12:48:10 +0000
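Subject: [PATCH 30/34] Update test_witness_tokens.py

In testPretokenizedWitness, create the collation with
Collation.create_from_dict and call collate(c, segmentation=False)
instead of collate_pretokenized_json.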
---
 collatex-pythonport/tests/test_witness_tokens.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/collatex-pythonport/tests/test_witness_tokens.py b/collatex-pythonport/tests/test_witness_tokens.py
index 56e97f437..0a0ff7f07 100644
--- a/collatex-pythonport/tests/test_witness_tokens.py
+++ b/collatex-pythonport/tests/test_witness_tokens.py
@@ -6,7 +6,7 @@
 
 import unittest
 from collatex import Collation
-from collatex.core_functions import collate_pretokenized_json
+from collatex.core_functions import collate
 
 
 class Test(unittest.TestCase):
@@ -52,7 +52,8 @@ def testPretokenizedWitness(self):
                 }
             ]
         }
-        result = collate_pretokenized_json(pretokenized_witness)
+        c = Collation.create_from_dict(pretokenized_witness)
+        result = collate(c, segmentation=False)
         self.assertEqual(len(result.rows[0].to_list()), 4)
         self.assertEqual(len(result.rows[1].to_list()), 4)
         # The second witness should have a token that reads 'mousedog bird'.

From 65a18f8dbd0b96d1e614a419f772a3a48aa66a3f Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Tue, 24 Mar 2015 17:54:39 +0000
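Subject: [PATCH 31/34] Update test_collatex_block_witnesses.py

Re-enable the block tests that were marked @unit_disabled and shift the
expected token ranges of the second and third witnesses (for example,
W2 in the Hermans case moves from 16-28 to 17-29).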

---
 .../tests/test_collatex_block_witnesses.py    | 23 ++++++++-----------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py
index 48e78361e..bd9a7a130 100644
--- a/collatex-pythonport/tests/test_collatex_block_witnesses.py
+++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py
@@ -32,13 +32,12 @@ def test_combined_string_hermans_case(self):
         self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens))
     
     # test whether the witness->range mapping works
-    @unit_disabled
     def test_witness_ranges_hermans_case(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
         collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
         self.assertEquals(RangeSet("0-14"), collation.get_range_for_witness("W1"))
-        self.assertEquals(RangeSet("16-28"), collation.get_range_for_witness("W2"))
+        self.assertEquals(RangeSet("17-29"), collation.get_range_for_witness("W2"))
 
 # TODO: re-enable test!    
     # Note: LCP intervals can overlap
@@ -74,14 +73,13 @@ def test_lcp_child_intervals_hermans_case(self):
         _, child_lcp_intervals = collation.get_lcp_intervals()
         self.assertFalse(child_lcp_intervals)
 
-    @unit_disabled
     def test_non_overlapping_blocks_black_cat(self):
         collation = Collation()
         collation.add_plain_witness("W1", "the black cat")
         collation.add_plain_witness("W2", "the black cat")
         algorithm = Scorer(collation)
         blocks = algorithm._get_non_overlapping_repeating_blocks()
-        block1 = Block(RangeSet("0-2, 4-6"))
+        block1 = Block(RangeSet("0-2, 5-7"))
         self.assertEqual([block1], blocks)
 
     #TODO: Fix number of siblings!
@@ -97,17 +95,15 @@ def test_blocks_failing_transposition_use_case_old_algorithm(self):
         block3 = Block(RangeSet("2, 8"))
         self.assertEqual([block1, block2, block3], blocks)
 
-    @unit_disabled
     def test_non_overlapping_blocks_Hermans(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
         collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
         algorithm = Scorer(collation)
         blocks = algorithm._get_non_overlapping_repeating_blocks()
-        self.assertIn(Block(RangeSet("0-8, 16-24")), blocks) # a b c d F g h i !
-        self.assertIn(Block(RangeSet("11-14, 25-28")), blocks) # q r s t
+        self.assertIn(Block(RangeSet("0-8, 17-25")), blocks) # a b c d F g h i !
+        self.assertIn(Block(RangeSet("11-14, 26-29")), blocks) # q r s t
 
-    @unit_disabled
     def test_blocks_Hermans_case_three_witnesses(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
@@ -115,21 +111,20 @@ def test_blocks_Hermans_case_three_witnesses(self):
         collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
         algorithm = Scorer(collation)
         blocks = algorithm._get_non_overlapping_repeating_blocks()
-        self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks) # a b c d
-        self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks) # g h i
-        self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), blocks) # ! q r s t
-        self.assertIn(Block(RangeSet("4, 20")), blocks) # F
+        self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), blocks) # a b c d
+        self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), blocks) # g h i 
+        self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")), blocks) # ! q r s t
+        self.assertIn(Block(RangeSet("4, 21")), blocks) # F
         
 
     # In the new approach nothing should be split
-    @unit_disabled
     def test_blocks_splitting_token_case(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a c b c")
         collation.add_plain_witness("W2", "a c b")
         algorithm = Scorer(collation)
         blocks = algorithm._get_non_overlapping_repeating_blocks()
-        block1 = Block(RangeSet("0-2, 5-7")) # a c b
+        block1 = Block(RangeSet("0-2, 6-8")) # a c b
         self.assertIn(block1, blocks)
 
     @unit_disabled

From c70db1a4a5ac99d4a473cc93d3030591649e1562 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 25 Mar 2015 10:49:04 +0000
Subject: [PATCH 32/34] Update test_collation_class.py

Mark test_collation_function_add_witnesses_with_same_id with
@unittest.expectedFailure.

---
 collatex-pythonport/tests/test_collation_class.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py
index a97fb684e..fd3d2ac7a 100644
--- a/collatex-pythonport/tests/test_collation_class.py
+++ b/collatex-pythonport/tests/test_collation_class.py
@@ -54,6 +54,7 @@ def test_collation_function_add_witness(self):
         self.c.add_witness(witnessdata)
         self.assertEqual(len(self.c.witnesses), 3)
     
+    @unittest.expectedFailure
     def test_collation_function_add_witnesses_with_same_id(self):
         witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]}
         witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]}

From ecdb7645456a96c695e6db0878e727bef0244c43 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 25 Mar 2015 14:59:44 +0000
Subject: [PATCH 33/34] Port to Python 3:

use next(generator) instead of generator.next()
---
 collatex-pythonport/collatex/collatex_suffix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/collatex-pythonport/collatex/collatex_suffix.py b/collatex-pythonport/collatex/collatex_suffix.py
index 4cd5ab4ee..713f7f3ac 100644
--- a/collatex-pythonport/collatex/collatex_suffix.py
+++ b/collatex-pythonport/collatex/collatex_suffix.py
@@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens):
     def debug(self):
         result = []
         for occurrence in self.occurrences:
-            result.append(' '.join(self.tokens[occurrence.token_range.slices().next()]))
+            result.append(' '.join(self.tokens[next(occurrence.token_range.slices())]))
         return result
 
 

From 193351e6d054d7dbcbec7c36bc71753040f59f00 Mon Sep 17 00:00:00 2001
From: enury <elisa.nury@gmail.com>
Date: Wed, 25 Mar 2015 15:02:30 +0000
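Subject: [PATCH 34/34] Update test_collatex_block_witnesses.py

Re-enable test_block_witnesses_Hermans_case_two_witnesses and
test_block_witnesses_Hermans_case by removing @unit_disabled.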

---
 collatex-pythonport/tests/test_collatex_block_witnesses.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py
index bd9a7a130..ee7cc3790 100644
--- a/collatex-pythonport/tests/test_collatex_block_witnesses.py
+++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py
@@ -127,7 +127,6 @@ def test_blocks_splitting_token_case(self):
         block1 = Block(RangeSet("0-2, 6-8")) # a c b
         self.assertIn(block1, blocks)
 
-    @unit_disabled
     def test_block_witnesses_Hermans_case_two_witnesses(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
@@ -138,7 +137,6 @@ def test_block_witnesses_Hermans_case_two_witnesses(self):
         block_witness = algorithm._get_block_witness(collation.witnesses[1])
         self.assertEquals(["a b c d F g h i !", "q r s t"], block_witness.debug())
 
-    @unit_disabled
     def test_block_witnesses_Hermans_case(self):
         collation = Collation()
         collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")