From f8dd356994f4b4a3b262cc274d46019b61473c57 Mon Sep 17 00:00:00 2001
From: baixiac
Date: Mon, 26 Jul 2021 17:39:59 +0100
Subject: [PATCH 01/16] CAT-18 upgrade spaCy to v3

---
 README.md | 6 +-
 medcat/cdb_maker.py | 12 +-
 medcat/linking/context_based_linker.py | 5 +-
 medcat/meta_cat.py | 1 +
 medcat/ner/vocab_based_ner.py | 3 +-
 medcat/pipe.py | 26 +-
 medcat/preprocessing/taggers.py | 65 +++--
 medcat/utils/make_vocab.py | 2 +-
 setup.py | 2 +-
 tests/__init__.py | 0
 tests/archive_tests/test_cdb_maker_archive.py | 237 ++++++++---------
 tests/archive_tests/test_ner_archive.py | 246 +++++++++---------
 tests/test_cdb_maker.py | 16 +-
 tests/test_ner.py | 4 +-
 webapp/webapp/Dockerfile | 2 +-
 15 files changed, 338 insertions(+), 289 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/README.md b/README.md
index 7b8fc2518..c124c409e 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,9 @@ A guide on how to use MedCAT is available in the [tutorial](https://github.com/C
 2. Get the scispacy models:
 
-`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz`
+`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz`
 
-`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz`
+`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz`
 
 3. Downlad the Vocabulary and CDB from the Models section bellow
 
@@ -98,7 +98,7 @@ CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat) - Buil
 MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects is an annotation Affirmed (Positve) or Other (Negated or Hypothetical)
 
-(Note: This is was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as
+(Note: This was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as
 that data is not publicaly available.)
 
 ### SNOMED-CT and UMLS
diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py
index c1cdc758e..3597831aa 100644
--- a/medcat/cdb_maker.py
+++ b/medcat/cdb_maker.py
@@ -44,28 +44,28 @@ def __init__(self, config, cdb=None, name_max_words=20):
 
         # Build the required spacy pipeline
         self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
-        self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config),
+        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                             name='skip_and_punct',
                             additional_fields=['is_punct'])
 
     def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False, full_build=False, only_existing_cuis=False, **kwargs):
-        r''' Compile one or multipe CSVs into a CDB.
+        r''' Compile one or multiple CSVs into a CDB.
 
         Args:
             csv_paths (`List[str]`):
                 An array of paths to the csv files that should be processed
-            full_build (`bool`, defautls to `True`):
+            full_build (`bool`, defaults to `True`):
                 If False only the core portions of the CDB will be built (the ones required for
                 the functioning of MedCAT). If True, everything will be added to the CDB - this
                 usually includes concept descriptions, various forms of names etc (take care that
                 this option produces a much larger CDB).
sep (`str`, defaults to `,`): - If necessarya a custom separator for the csv files + If necessary a custom separator for the csv files encoding (`str`, optional): - Encoing to be used for reading the CSV file + Encoding to be used for reading the CSV file escapechar (`str`, optional): - Escapechar for the CSV + Escape char for the CSV index_col (`bool`, defaults_to `False`): Index column for pandas read_csv only_existing_cuis (`bool`, defaults to False): diff --git a/medcat/linking/context_based_linker.py b/medcat/linking/context_based_linker.py index e7fb09578..94d1a01dc 100644 --- a/medcat/linking/context_based_linker.py +++ b/medcat/linking/context_based_linker.py @@ -1,7 +1,8 @@ -from medcat.utils.filters import check_filters -from medcat.linking.vector_context_model import ContextModel import random import logging +from medcat.utils.filters import check_filters +from medcat.linking.vector_context_model import ContextModel + class Linker(object): r''' Link to a biomedical database. diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index af197801f..3b46b856e 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -10,6 +10,7 @@ from medcat.preprocessing.tokenizers import TokenizerWrapperBPE from medcat.preprocessing.tokenizers import TokenizerWrapperBERT + class MetaCAT(object): r''' TODO: Add documentation ''' diff --git a/medcat/ner/vocab_based_ner.py b/medcat/ner/vocab_based_ner.py index 9aef8f41f..7c12c5ff2 100644 --- a/medcat/ner/vocab_based_ner.py +++ b/medcat/ner/vocab_based_ner.py @@ -1,5 +1,6 @@ -from medcat.ner.vocab_based_annotator import maybe_annotate_name import logging +from medcat.ner.vocab_based_annotator import maybe_annotate_name + class NER(object): r''' diff --git a/medcat/pipe.py b/medcat/pipe.py index 470fc146c..d22094bb5 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -1,7 +1,8 @@ +import spacy from spacy.tokens import Token, Doc, Span +from spacy.language import Language from medcat.utils.normalizers import TokenNormalizer -import spacy -import os + class Pipe(object): r''' A wrapper around the standard spacy pipeline. @@ -21,6 +22,7 @@ def __init__(self, tokenizer, config): if config.preprocessing['stopwords'] is not None: self.nlp.Defaults.stop_words = set(config.preprocessing['stopwords']) self.nlp.tokenizer = tokenizer(self.nlp) + self.config = config def add_tagger(self, tagger, name, additional_fields=[]): @@ -35,7 +37,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. 
''' - self.nlp.add_pipe(tagger, name='tag_' + name, first=True) + component_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -46,7 +50,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) - self.nlp.add_pipe(token_normalizer, name='token_normalizer', last=True) + component_name = spacy.util.get_object_name(token_normalizer) + Language.component(name=component_name, func=token_normalizer) + self.nlp.add_pipe(component_name, name='token_normalizer', last=True) # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) @@ -57,7 +63,9 @@ def add_ner(self, ner): to the document and Span objects. ''' - self.nlp.add_pipe(ner, name='cat_ner', last=True) + component_name = spacy.util.get_object_name(ner) + Language.component(name=component_name, func=ner) + self.nlp.add_pipe(component_name, name='cat_ner', last=True) Doc.set_extension('ents', default=[], force=True) Span.set_extension('confidence', default=-1, force=True) @@ -76,13 +84,17 @@ def add_linker(self, linker): Any object/function created based on the requirements for a spaCy pipeline components. Have a look at https://spacy.io/usage/processing-pipelines#custom-components ''' - self.nlp.add_pipe(linker, name='cat_linker', last=True) + component_name = spacy.util.get_object_name(linker) + Language.component(name=component_name, func=linker) + self.nlp.add_pipe(component_name, name='cat_linker', last=True) Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) def add_meta_cat(self, meta_cat, name): - self.nlp.add_pipe(meta_cat, name=name, last=True) + component_name = spacy.util.get_object_name(meta_cat) + Language.component(name=component_name, func=meta_cat) + self.nlp.add_pipe(component_name, name=name, last=True) # Only the meta_anns field is needed, it will be a dictionary #of {category_name: value, ...} diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index 474e2d019..c63539a12 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,30 +1,45 @@ import re -def tag_skip_and_punct(doc, config): +def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. - config (`medcat.config.Config`): - Global config for medcat. - - Return: - (`spacy.tokens.Doc): - Tagged spacy document + Args: + config (`medcat.config.Config`): + Global config for medcat. 
''' - # Make life easier - cnf_p = config.preprocessing - - for token in doc: - if config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: - # There can't be punct in a token if it also has text - token._.is_punct = True - token._.to_skip = True - elif config.word_skipper.match(token.lower_): - # Skip if specific strings - token._.to_skip = True - elif cnf_p['skip_stopwords'] and token.is_stop: - token._.to_skip = True - - return doc + + return TagSkipAndPunct(nlp, name, config) + +class TagSkipAndPunct(object): + + def __init__(self, nlp, name, config): + self.nlp = nlp + self.name = name + self.config = config + + def __call__(self, doc): + r''' Detects and tags spacy tokens that are punctuation and that should be skipped. + + Args: + doc (`spacy.tokens.Doc`): + Spacy document that will be tagged. + + Return: + (`spacy.tokens.Doc): + Tagged spacy document + ''' + # Make life easier + cnf_p = self.config.preprocessing + + for token in doc: + if self.config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: + # There can't be punct in a token if it also has text + token._.is_punct = True + token._.to_skip = True + elif self.config.word_skipper.match(token.lower_): + # Skip if specific strings + token._.to_skip = True + elif cnf_p['skip_stopwords'] and token.is_stop: + token._.to_skip = True + + return doc diff --git a/medcat/utils/make_vocab.py b/medcat/utils/make_vocab.py index 01547d4c9..a9ebfc18f 100644 --- a/medcat/utils/make_vocab.py +++ b/medcat/utils/make_vocab.py @@ -40,7 +40,7 @@ def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None): # Build the required spacy pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff --git a/setup.py b/setup.py index 3e33f8f58..00042b257 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==2.3.4', + 'spacy==3.1.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 422fde2cd..8b8bb2acd 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -1,119 +1,124 @@ -r''' The tests here are a bit messy but they work, should be converted to python unittests. 
-''' -from medcat.cdb_maker import CDBMaker -from medcat.config import Config -import numpy as np import logging - -config = Config() -config.general['log_level'] = logging.DEBUG -maker = CDBMaker(config) - -# Building a new CDB from two files (full_build) -csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] -cdb = maker.prepare_csvs(csvs, full_build=True) - -assert len(cdb.cui2names) == 3 -assert len(cdb.cui2snames) == 3 -assert len(cdb.name2cuis) == 5 -assert len(cdb.cui2tags) == 3 -assert len(cdb.cui2preferred_name) == 2 -assert len(cdb.cui2context_vectors) == 3 -assert len(cdb.cui2count_train) == 3 -assert cdb.name2cuis2status['virus']['C0000039'] == 'P' -assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} -assert cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} -assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") - -# Test name addition -from medcat.preprocessing.cleaners import prepare_name -cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config), name_status='P', full_build=True) -assert cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} -assert 'my:newname.' in cdb.name2cuis -assert 'my:new' in cdb.snames -assert 'my:newname.' in cdb.name2cuis2status -assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} - -# Test name removal -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -# Run again to make sure it does not break anything -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -assert len(cdb.name2cuis) == 5 -assert 'my:newname.' not in cdb.name2cuis2status - -# Test filtering -cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept -cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) -assert len(cdb.cui2names) == 2 -assert len(cdb.name2cuis) == 4 -assert len(cdb.snames) == 4 - -# Test vector addition +import unittest import numpy as np -cdb.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - - -# Test negative -for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=True) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - -# Test save/load from medcat.cdb import CDB -cdb.save("./tmp_cdb.dat") -cdb2 = CDB.load('./tmp_cdb.dat') -# Check a random thing -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] - -# Test training import -cdb.reset_training() -cdb2.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) - -cdb2.import_training(cdb=cdb, overwrite=True) -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] -assert cdb2.cui2count_train['C0000139'] == 
cdb.cui2count_train['C0000139'] - -# Test concept similarity -cdb = CDB(config=config) -np.random.seed(11) -for i in range(500): - cui = "C" + str(i) - type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), maker.nlp, {}, config), ontologies=set(), - name_status='P', type_ids=type_ids, description='', full_build=True) - - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) -res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) -assert len(res) == 10 - -# Test training reset -cdb.reset_training() -assert len(cdb.cui2context_vectors['C0']) == 0 -assert cdb.cui2count_train['C0'] == 0 +from medcat.cdb_maker import CDBMaker +from medcat.config import Config +from medcat.preprocessing.cleaners import prepare_name + + +class CdbMakerArchiveTests(unittest.TestCase): + + def setUp(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + self.maker = CDBMaker(self.config) + + # Building a new CDB from two files (full_build) + csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + self.cdb = self.maker.prepare_csvs(csvs, full_build=True) + + def test_prepare_csvs(self): + assert len(self.cdb.cui2names) == 3 + assert len(self.cdb.cui2snames) == 3 + assert len(self.cdb.name2cuis) == 5 + assert len(self.cdb.cui2tags) == 3 + assert len(self.cdb.cui2preferred_name) == 2 + assert len(self.cdb.cui2context_vectors) == 3 + assert len(self.cdb.cui2count_train) == 3 + assert self.cdb.name2cuis2status['virus']['C0000039'] == 'P' + assert self.cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} + assert self.cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} + assert self.cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") + + def test_name_addition(self): + self.cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config), name_status='P', full_build=True) + assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} + assert 'my:newname.' in self.cdb.name2cuis + assert 'my:new' in self.cdb.snames + assert 'my:newname.' in self.cdb.name2cuis2status + assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} + + def test_name_removal(self): + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + # Run again to make sure it does not break anything + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + assert len(self.cdb.name2cuis) == 5 + assert 'my:newname.' 
not in self.cdb.name2cuis2status + + def test_filtering(self): + cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept + self.cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) + assert len(self.cdb.cui2names) == 2 + assert len(self.cdb.name2cuis) == 4 + assert len(self.cdb.snames) == 4 + + def test_vector_addition(self): + self.cdb.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + + def test_negative(self): + cuis = list(self.cdb.cui2names.keys()) + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=True) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + def test_save_and_load(self): + self.cdb.save("./tmp_cdb.dat") + cdb2 = CDB.load('./tmp_cdb.dat') + # Check a random thing + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + + def test_training_import(self): + cdb2 = CDB.load('./tmp_cdb.dat') + self.cdb.reset_training() + cdb2.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + cdb2.import_training(cdb=self.cdb, overwrite=True) + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + assert cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139'] + + def test_concept_similarity(self): + cdb = CDB(config=self.config) + np.random.seed(11) + for i in range(500): + cui = "C" + str(i) + type_ids = {'T-' + str(i%10)} + cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config), ontologies=set(), + name_status='P', type_ids=type_ids, description='', full_build=True) + + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + cdb.update_context_vector(cui, vectors, negative=False) + res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) + assert len(res) == 10 + + def test_training_reset(self): + self.cdb.reset_training() + assert len(self.cdb.cui2context_vectors['C0']) == 0 + assert self.cdb.cui2count_train['C0'] == 0 diff --git a/tests/archive_tests/test_ner_archive.py b/tests/archive_tests/test_ner_archive.py index 1be695f35..6037f3c16 100644 --- a/tests/archive_tests/test_ner_archive.py +++ b/tests/archive_tests/test_ner_archive.py @@ -1,3 +1,9 @@ +import logging +import os +import unittest +import numpy as np +from timeit import default_timer as timer +from medcat.cdb import CDB from medcat.preprocessing.tokenizers import spacy_split_all from medcat.ner.vocab_based_ner import NER from medcat.preprocessing.taggers import tag_skip_and_punct @@ -6,127 +12,127 @@ from medcat.vocab import Vocab from medcat.preprocessing.cleaners import prepare_name from 
medcat.linking.vector_context_model import ContextModel -from functools import partial from medcat.linking.context_based_linker import Linker from medcat.config import Config -import logging -from medcat.cdb import CDB -import os -import requests - -config = Config() -config.general['log_level'] = logging.INFO -cdb = CDB(config=config) - -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config), - name='skip_and_punct', - additional_fields=['is_punct']) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} - -vocab_path = "./tmp_vocab.dat" -if not os.path.exists(vocab_path): - import requests - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") - with open(vocab_path, 'wb') as f: - f.write(tmp.content) - -vocab = Vocab.load(vocab_path) -# Make the pipeline -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config), - name='skip_and_punct', - additional_fields=['is_punct']) -spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=vocab) -nlp.add_token_normalizer(spell_checker=spell_checker, config=config) -ner = NER(cdb, config) -nlp.add_ner(ner) - -# Add Linker -link = Linker(cdb, vocab, config) -nlp.add_linker(link) - -# Test limits for tokens and uppercase -config.ner['max_skip_tokens'] = 1 -config.ner['upper_case_limit_len'] = 4 -config.linking['disamb_length_limit'] = 2 -text = "CDB - I was running and then Movar Virus attacked and CDb" -d = nlp(text) - -assert len(d._.ents) == 2 -assert d._.ents[0]._.link_candidates[0] == 'S-229004' - -# Change limit for skip -config.ner['max_skip_tokens'] = 3 -d = nlp(text) -assert len(d._.ents) == 3 - -# Change limit for upper_case -config.ner['upper_case_limit_len'] = 3 -d = nlp(text) -assert len(d._.ents) == 4 - -# Check name length limit -config.ner['min_name_len'] = 4 -d = nlp(text) -assert len(d._.ents) == 2 - -# Speed tests -from timeit import default_timer as timer -text = "CDB - I was running and then Movar Virus attacked and CDb" -text = text * 300 -config.general['spell_check'] = True -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - -# Now without spell check -config.general['spell_check'] = False -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - - -# Test for linker -import numpy as np - -config = Config() -config.general['log_level'] = logging.DEBUG -cdb = CDB(config=config) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -cdb.add_names(cui='S-2290045', names=prepare_name('Movar', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} - -cuis = list(cdb.cui2names.keys()) -for cui in cuis[0:50]: - vectors = {'short': np.random.rand(300), - 'long': np.random.rand(300), - 'medium': np.random.rand(300) - } - cdb.update_context_vector(cui, vectors, 
negative=False) - -vocab = Vocab.load(vocab_path) -cm = ContextModel(cdb, vocab, config) -cm.train_using_negative_sampling('S-229004') -config.linking['train_count_threshold'] = 0 - -cm.train('S-229004', d._.ents[1], d) -cm.similarity('S-229004', d._.ents[1], d) -cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) +class NerArchiveTests(unittest.TestCase): + + def setUp(self) -> None: + self.config = Config() + self.config.general['log_level'] = logging.INFO + cdb = CDB(config=self.config) + + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} + + self.vocab_path = "./tmp_vocab.dat" + if not os.path.exists(self.vocab_path): + import requests + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") + with open(self.vocab_path, 'wb') as f: + f.write(tmp.content) + + vocab = Vocab.load(self.vocab_path) + # Make the pipeline + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab) + self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config) + ner = NER(cdb, self.config) + self.nlp.add_ner(ner) + + # Add Linker + link = Linker(cdb, vocab, self.config) + self.nlp.add_linker(link) + + self.text = "CDB - I was running and then Movar Virus attacked and CDb" + + def test_limits_for_tokens_and_uppercase(self): + self.config.ner['max_skip_tokens'] = 1 + self.config.ner['upper_case_limit_len'] = 4 + self.config.linking['disamb_length_limit'] = 2 + + d = self.nlp(self.text) + + assert len(d._.ents) == 2 + assert d._.ents[0]._.link_candidates[0] == 'S-229004' + + def test_change_limit_for_skip(self): + self.config.ner['max_skip_tokens'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 3 + + def test_change_limit_for_upper_case(self): + self.config.ner['upper_case_limit_len'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 4 + + def test_check_name_length_limit(self): + self.config.ner['min_name_len'] = 4 + d = self.nlp(self.text) + assert len(d._.ents) == 2 + + def test_speed(self): + text = "CDB - I was running and then Movar Virus attacked and CDb" + text = text * 300 + self.config.general['spell_check'] = True + start = timer() + for i in range(50): + d = self.nlp(text) + end = timer() + print("Time: ", end - start) + + def test_without_spell_check(self): + # Now without spell check + self.config.general['spell_check'] = False + start = timer() + for i in range(50): + d = self.nlp(self.text) + end = timer() + print("Time: ", end - start) + + + def test_for_linker(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + cdb = CDB(config=self.config) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config)) + 
cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + cdb.add_names(cui='S-2290045', names=prepare_name('Movar', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} + + cuis = list(cdb.cui2names.keys()) + for cui in cuis[0:50]: + vectors = {'short': np.random.rand(300), + 'long': np.random.rand(300), + 'medium': np.random.rand(300) + } + cdb.update_context_vector(cui, vectors, negative=False) + + d = self.nlp(self.text) + vocab = Vocab.load(self.vocab_path) + cm = ContextModel(cdb, vocab, self.config) + cm.train_using_negative_sampling('S-229004') + self.config.linking['train_count_threshold'] = 0 + + cm.train('S-229004', d._.ents[1], d) + + cm.similarity('S-229004', d._.ents[1], d) + + cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) diff --git a/tests/test_cdb_maker.py b/tests/test_cdb_maker.py index 3917c075b..9d84215a5 100644 --- a/tests/test_cdb_maker.py +++ b/tests/test_cdb_maker.py @@ -1,10 +1,11 @@ import unittest +import logging +import os +import numpy as np from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB from medcat.config import Config from medcat.preprocessing.cleaners import prepare_name -import numpy as np -import logging #cdb.csv #cui name ontologies name_status type_ids description @@ -20,6 +21,7 @@ #TESTS RUN IN ALPHABETICAL ORDER - CONTROLLING WITH '[class_letter]Class and test_[classletter subclassletter]' function syntax + class A_CDBMakerLoadTests(unittest.TestCase): @classmethod @@ -28,7 +30,10 @@ def setUpClass(cls): config = Config() config.general['log_level'] = logging.DEBUG maker = CDBMaker(config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = maker.prepare_csvs(csvs, full_build=True) def test_aa_cdb_names_length(self): @@ -110,7 +115,10 @@ def setUpClass(cls): cls.config = Config() cls.config.general['log_level'] = logging.DEBUG cls.maker = CDBMaker(cls.config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) cls.cdb2 = CDB(cls.config) diff --git a/tests/test_ner.py b/tests/test_ner.py index e05be9659..14720c205 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -28,7 +28,7 @@ def setUpClass(cls): print("Set up Vocab") vocab_path = "./tmp_vocab.dat" if not os.path.exists(vocab_path): - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") with open(vocab_path, 'wb') as f: f.write(tmp.content) @@ -37,7 +37,7 @@ def setUpClass(cls): print("Set up NLP pipeline") cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config) - cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config), + cls.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff --git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccf9dfa81..ccd5e591c 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -19,7 +19,7 @@ RUN pip install -r requirements.txt # Get the 
spacy and scipspacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From add6d7b3130ba1c4375f0324e31ee341a6f831d9 Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:12:33 +0100 Subject: [PATCH 02/16] CAT-31 add github actions and manage scispaCy models using requirements-*.txt --- .github/workflows/main.yml | 30 ++++++++++++++++++++++++++++++ medcat/utils/normalizers.py | 5 +---- requirements-lg.txt | 2 ++ requirements-sm.txt | 2 ++ requirements.txt | 1 + setup.py | 4 ++-- 6 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/main.yml create mode 100644 requirements-lg.txt create mode 100644 requirements-sm.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..b5eaceb3c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,30 @@ +name: Main + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ 3.7 ] + max-parallel: 1 + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Test + run: | + python -m unittest discover \ No newline at end of file diff --git a/medcat/utils/normalizers.py b/medcat/utils/normalizers.py index cc15bb0f6..8e6a07e27 100644 --- a/medcat/utils/normalizers.py +++ b/medcat/utils/normalizers.py @@ -1,9 +1,6 @@ -#import hunspell import re -from collections import Counter -from spacy.tokens import Span import spacy -import os + CONTAINS_NUMBER = re.compile('[0-9]+') diff --git a/requirements-lg.txt b/requirements-lg.txt new file mode 100644 index 000000000..7d293df4e --- /dev/null +++ b/requirements-lg.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements-sm.txt b/requirements-sm.txt new file mode 100644 index 000000000..03885cbf2 --- /dev/null +++ b/requirements-sm.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9c558e357..78c7cf284 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ . 
+https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz \ No newline at end of file diff --git a/setup.py b/setup.py index 0012bfa07..f65830479 100644 --- a/setup.py +++ b/setup.py @@ -21,11 +21,11 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==3.1.1', + 'spacy<3.1.0,>=3.0.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', - 'Flask~=1.1', + 'tqdm<4.50.0,>=4.27', 'sklearn~=0.0', 'elasticsearch~=7.10', 'dill~=0.3.3', From 340ced732c84eaa2b591c865f06fe86505880aea Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 03/16] CAT-31 test github actions --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..865cfd186 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: Main on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: branches: [ master ] From 5e5e885a56d25bb3884b60ea37371c8bc340d42a Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 04/16] CAT-31 add badges to readme and fix vocab urls --- .github/workflows/main.yml | 6 +++--- README.md | 6 +++++- webapp/envs/env_medmen | 4 ++-- webapp/webapp/Dockerfile | 7 +++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..df7f5d505 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,8 +1,8 @@ -name: Main +name: build on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: branches: [ master ] @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.7 ] + python-version: [ 3.7, 3.8, 3.9 ] max-parallel: 1 steps: diff --git a/README.md b/README.md index f1eea4679..2437541b8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # Medical oncept Annotation Tool +[![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster) +[![Latest release](https://img.shields.io/github/v/release/CogStack/MedCAT)](https://github.com/CogStack/MedCAT/releases/latest) +[![pypi Version](https://img.shields.io/pypi/v/medcat.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/medcat/) + MedCAT can be used to extract information from Electronic Health Records (EHRs) and link it to biomedical ontologies like SNOMED-CT and UMLS. Paper on [arXiv](https://arxiv.org/abs/2010.01165). -## News +## News - **New Feature and Tutorial \[8. July 2021\]**: [Integrating 🤗 Transformers with MedCAT for biomedical NER+L](https://towardsdatascience.com/integrating-transformers-with-medcat-for-biomedical-ner-l-8869c76762a) - **General \[1. April 2021\]**: MedCAT is upgraded to v1, unforunately this introduces breaking changes with older models (MedCAT v0.4), as well as potential problems with all code that used the MedCAT package. MedCAT v0.4 is available on the legacy diff --git a/webapp/envs/env_medmen b/webapp/envs/env_medmen index 85b79f850..447f0dfb9 100644 --- a/webapp/envs/env_medmen +++ b/webapp/envs/env_medmen @@ -19,8 +19,8 @@ KEEP_PUNCT=:|. 
SPACY_MODEL=en_core_sci_md VOCAB_PATH=/webapp/models/vocab.dat CDB_PATH=/webapp/models/cdb.dat -VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat MKL_NUM_THREAD=1 NUMEXPR_NUM_THREADS=1 diff --git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccd5e591c..61575edba 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -6,8 +6,8 @@ RUN mkdir -p /webapp/models # Copy everything COPY . /webapp -ENV VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -ENV CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +ENV VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +ENV CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat ENV CDB_PATH=/webapp/models/cdb.dat ENV VOCAB_PATH=/webapp/models/vocab.dat @@ -17,9 +17,8 @@ WORKDIR /webapp RUN pip install -r requirements.txt -# Get the spacy and scipspacy model +# Get the spacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From e1f53c31acabeecdb439cf1c71c539bbc5b4f88b Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 17:54:15 +0100 Subject: [PATCH 05/16] CAT-18 replace another partial function --- .github/workflows/main.yml | 2 +- medcat/cat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df7f5d505..849333ac5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: python-version: [ 3.7, 3.8, 3.9 ] - max-parallel: 1 + max-parallel: 3 steps: - uses: actions/checkout@v2 diff --git a/medcat/cat.py b/medcat/cat.py index 9a7db9f15..f5c86dbed 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -70,7 +70,7 @@ def __init__(self, cdb, config, vocab, meta_cats=[]): # Build the pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) From f91239f6ab092f5d54fc58014c97307c262c32ad Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:06 +0100 Subject: [PATCH 06/16] CAT-18 close files after their use and pass on the replace param --- medcat/vocab.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/medcat/vocab.py b/medcat/vocab.py index d2558744a..f21de2c8a 100644 --- a/medcat/vocab.py +++ b/medcat/vocab.py @@ -134,7 +134,7 @@ def add_word(self, word, cnt=1, vec=None, replace=True): cnt (int): count of this word in your dataset vec (np.array): - the vector repesentation of the word + the vector representation of the word replace (bool): will replace old vector representation """ @@ -170,17 +170,16 @@ def add_words(self, path, replace=True): replace (bool): existing words in the vocabulary will be replaced """ - f = open(path) + with open(path) as f: + for line in f: + parts = line.split("\t") + word = parts[0] + cnt = int(parts[1].strip()) + vec = None + if len(parts) == 3: + vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - for line in f: - parts = line.split("\t") - word = parts[0] - cnt = int(parts[1].strip()) - 
vec = None - if len(parts) == 3: - vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - - self.add_word(word, cnt, vec) + self.add_word(word, cnt, vec, replace) def make_unigram_table(self, table_size=100000000): @@ -232,13 +231,17 @@ def get_negative_samples(self, n=6, ignore_punct_and_num=False): def __getitem__(self, word): - return self.vocab[word]['cnt'] + return self.count(word) def vec(self, word): return self.vocab[word]['vec'] + def count(self, word): + return self.vocab[word]['cnt'] + + def item(self, word): return self.vocab[word] From edd3d6e64601ecda85d4d0da55da63b5dd5cb82c Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:58 +0100 Subject: [PATCH 07/16] CAT-18 refactor and add more tests --- examples/vocab_data.txt | 2 ++ medcat/cat.py | 2 +- medcat/pipe.py | 12 ++------ medcat/preprocessing/taggers.py | 21 +++++--------- tests/test_cat.py | 27 +++++++++++++++++ tests/test_cdb.py | 51 +++++++++++++++++++++++++++++++++ tests/test_vocab.py | 39 +++++++++++++++++++++++++ 7 files changed, 130 insertions(+), 24 deletions(-) create mode 100644 examples/vocab_data.txt create mode 100644 tests/test_cat.py create mode 100644 tests/test_cdb.py create mode 100644 tests/test_vocab.py diff --git a/examples/vocab_data.txt b/examples/vocab_data.txt new file mode 100644 index 000000000..da3f43048 --- /dev/null +++ b/examples/vocab_data.txt @@ -0,0 +1,2 @@ +house 34444 0.3232 0.123213 1.231231 +dog 14444 0.76762 0.76767 1.45454 diff --git a/medcat/cat.py b/medcat/cat.py index f5c86dbed..ce8a24628 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -116,7 +116,7 @@ def __call__(self, text, do_train=False): Returns: A spacy document with the extracted entities ''' - # Should we train - do not use this for training, unles you know what you are doing. Use the + # Should we train - do not use this for training, unless you know what you are doing. Use the #self.train() function self.config.linking['train'] = do_train diff --git a/medcat/pipe.py b/medcat/pipe.py index d22094bb5..7fd7c5a10 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -24,7 +24,6 @@ def __init__(self, tokenizer, config): self.nlp.tokenizer = tokenizer(self.nlp) self.config = config - def add_tagger(self, tagger, name, additional_fields=[]): r''' Add any kind of a tagger for tokens. @@ -37,9 +36,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. 
''' - component_name = spacy.util.get_object_name(tagger) - Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) - self.nlp.add_pipe(component_name, name='tag_' + name, first=True) + component_factory_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_factory_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_factory_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -47,7 +46,6 @@ def add_tagger(self, tagger, name, additional_fields=[]): for field in additional_fields: Token.set_extension(field, default=False, force=True) - def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) component_name = spacy.util.get_object_name(token_normalizer) @@ -57,7 +55,6 @@ def add_token_normalizer(self, config, spell_checker=None): # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) - def add_ner(self, ner): r''' Add NER from CAT to the pipeline, will also add the necessary fields to the document and Span objects. @@ -75,7 +72,6 @@ def add_ner(self, ner): Span.set_extension('detected_name', default=None, force=True) Span.set_extension('link_candidates', default=None, force=True) - def add_linker(self, linker): r''' Add entity linker to the pipeline, will also add the necessary fields to Span object. @@ -90,7 +86,6 @@ def add_linker(self, linker): Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) - def add_meta_cat(self, meta_cat, name): component_name = spacy.util.get_object_name(meta_cat) Language.component(name=component_name, func=meta_cat) @@ -100,6 +95,5 @@ def add_meta_cat(self, meta_cat, name): #of {category_name: value, ...} Span.set_extension('meta_anns', default=None, force=True) - def __call__(self, text): return self.nlp(text) diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index c63539a12..6ec2ac6e0 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,16 +1,19 @@ -import re - def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. Args: + nlp (spacy.language.): + The base spacy NLP pipeline. + name (`str`): + The component instance name. config (`medcat.config.Config`): Global config for medcat. ''' - return TagSkipAndPunct(nlp, name, config) + return _Tagger(nlp, name, config) + -class TagSkipAndPunct(object): +class _Tagger(object): def __init__(self, nlp, name, config): self.nlp = nlp @@ -18,16 +21,6 @@ def __init__(self, nlp, name, config): self.config = config def __call__(self, doc): - r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. 
- - Return: - (`spacy.tokens.Doc): - Tagged spacy document - ''' # Make life easier cnf_p = self.config.preprocessing diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 000000000..869bcba54 --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,27 @@ +import os +import unittest +from medcat.vocab import Vocab +from medcat.cdb import CDB +from medcat.cat import CAT + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) + self.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + self.cdb.config.ner['min_name_len'] = 2 + self.cdb.config.ner['upper_case_limit_len'] = 3 + self.cdb.config.general['spell_check'] = True + self.cdb.config.linking['train_count_threshold'] = 10 + self.cdb.config.linking['similarity_threshold'] = 0.3 + self.cdb.config.linking['train'] = True + self.cdb.config.linking['disamb_length_limit'] = 5 + self.cdb.config.general['full_unlink'] = True + self.undertest = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab) + + def test_pipeline(self): + text = "The dog is sitting outside the house." + doc = self.undertest(text) + self.assertEqual(text, doc.text) + diff --git a/tests/test_cdb.py b/tests/test_cdb.py new file mode 100644 index 000000000..0d9bad237 --- /dev/null +++ b/tests/test_cdb.py @@ -0,0 +1,51 @@ +import os +import shutil +import unittest +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class CDBTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_sci_md" + cls.cdb_maker = CDBMaker(config) + + def setUp(self) -> None: + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.csv") + cdb_2_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb_2.csv") + self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + self.undertest = CDBTests.cdb_maker.prepare_csvs([cdb_csv, cdb_2_csv], full_build=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_name2cuis(self): + self.assertEqual({ + 'second~csv': ['C0000239'], + 'virus': ['C0000039', 'C0000139'], + 'virus~k': ['C0000039', 'C0000139'], + 'virus~m': ['C0000039', 'C0000139'], + 'virus~z': ['C0000039', 'C0000139'] + }, self.undertest.name2cuis) + + def test_cui2names(self): + self.assertEqual({ + 'C0000039': {'virus~z', 'virus~k', 'virus~m', 'virus'}, + 'C0000139': {'virus~z', 'virus', 'virus~m', 'virus~k'}, + 'C0000239': {'second~csv'} + }, self.undertest.cui2names) + + def test_cui2preferred_name(self): + self.assertEqual({'C0000039': 'Virus', 'C0000139': 'Virus Z'}, self.undertest.cui2preferred_name) + + def test_cui2type_ids(self): + self.assertEqual({'C0000039': {'T109', 'T234', 'T123'}, 'C0000139': set(), 'C0000239': set()}, self.undertest.cui2type_ids) + + def test_save_and_load(self): + cdb_path = f"{self.tmp_dir}/cdb.dat" + self.undertest.save(cdb_path) + self.undertest.load(cdb_path) diff --git a/tests/test_vocab.py b/tests/test_vocab.py new file mode 100644 index 000000000..8db82df41 --- /dev/null +++ b/tests/test_vocab.py @@ -0,0 +1,39 @@ +import os +import shutil +import unittest +from medcat.vocab import Vocab + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.undertest = Vocab() + self.tmp_dir = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_add_words(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys())) + + def test_add_word(self): + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + self.assertEqual(["test"], list(self.undertest.vocab.keys())) + self.assertTrue("test" in self.undertest) + + def test_count(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(34444, self.undertest.count("house")) + + def test_save_and_load(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + vocab_path = f"{self.tmp_dir}/vocab.dat" + self.undertest.save(vocab_path) + vocab = Vocab.load(vocab_path) + self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys())) + + + From 91e173068466295c1bd079fe0e719dac01973a25 Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 12:00:43 +0100 Subject: [PATCH 08/16] CAT-18 fix git ignore --- .gitignore | 1 + examples/cdb.dat | Bin 0 -> 3301 bytes examples/vocab.dat | Bin 0 -> 369 bytes 3 files changed, 1 insertion(+) create mode 100644 examples/cdb.dat create mode 100644 examples/vocab.dat diff --git a/.gitignore b/.gitignore index eeff9f585..028221ea9 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ tmp.py # models files *.dat +!examples/*.dat diff --git a/examples/cdb.dat b/examples/cdb.dat new file mode 100644 index 0000000000000000000000000000000000000000..d972e22f5b09e6cd97cf6fbca200dfe9b93966cb GIT binary patch literal 3301 zcmaJ^-ESO85#RON*?4agC$R~4N|09KA~`~_6X%3<(TU)g!;p=Ak)03jPIs-|ncnST zXQqegX(x^@5fTDosim{z4pAIk6y4kL!~rk-3A_Ohd_5sv`6J+0J+o^E3C7x)?&|95 z`uJ5<`^S?{{@$+n=b@XoM*@}hqk*IGbP)Df!iOUF)0xD?MZ~noc+LxzJ6r$6*<&eB z1ltn1j#R4MN7y%YrUCZBuZH;J}M zkBw+~G>{|W$ynVJdBC+`)v|kRY;zz|k@L8;I93D5*%Gnq&e;~t`YKO&;q2GX&HDv1 z_q@JxZoV3H&dn!6Yd!$g*`u^2wFyO@kXwJT&GahWSFw(piL;FA_nM&b~ z(q|1Q;p`E3NYkLXB?7n@5}RZjq`=c)j*4v-M=6EGzNSGCX%Ic2S-V=zO-$31iO2_n zrOKo%kR}~M)bWz$86yG0Oi9qDSE!*t^aO-@H4_!W2llwYZj}qBgNP-ev(ty8|A=(R zL>%J;5e(g9d*aZPg+gSA&sPr9VhD_*Br06DY3H<5_gE@)0e_6O+Oe426Kx zUOF>bCUWM(N3Pdy$Jk?8F0xz&2tbsUpgpdKQO0zkvU@5I!DWy60V@=vCHE;1+DMgl zyS6oiiYzl}P-2%tbvv8z`wVzA2{Z?0(gA{g{oB`O zYW75^u(3F_*`Imm+PbriJJ-JB-#7hx74PfUH~f3k?HYNO60c3i`@Di&*KHtBILsSX z1*$t?=MFi&KjdY@v6pUi^3+t3^C(5src_o$iBEvZVmlMS zA@~FbjG8@4mpFIbx;^5ZlDpNbt#AJGUuRb?7&}XFPI{I3qsTVw+$jG_7&8qIYJWiU z{{8#0Lae&)*dwFi1v^y{&)CUw^%Ra1VPq25owic|N)E_048TxjdnNgvpbYo2<(+O# z-o@_+t5<*bjGle@{_54gRl^Ssh98a@jGgO8F$xgt31qtP`I+g*&EEu+De#oMSddNJWaPpRSJU6l_h@5N%E1{ZX}gW=+u z2v?SGRV55`>Iw_lHcH$(F+v{lZ@F%|Qgi3e7jDA^RjDiaS_~!z1!FCluCG6I>%?Nu z95^i82D+o*kYC}q11}!XS%pXFR78P8$r!u(fC-5FeHmmsmx=*6sn_JH$9qvsL0P@} zkFvwruZ`EE0dgB`6dT6Vkfp%dV>^qA(gz%}JW4q7h_Vp-|J?VFeZ5-lQ@f(Xhrxl! z0KfyD8Pro`MHUOj`+$~PyJ+lJL$SqUgIuEOP36DZw`0#@>>F66QUJNopvv?e+bj`E z7`KywLGS$3X=B?ywW%neE;>7rBX5sFZjwbY!NxW}o=2HG|EBA9en$R3R@&;TDR#)8 z+t!C4^9v6yUU>VXZ&XaiHqi9TUPz`d`SYkIO>ITFDSOq?m5cOTej5k+;Ux?OdKPg! 
zO8gz*|MLw0i%0H{w?CjI`594KTEI2oqd;$gSe>wTbC{Z8Tc$+rq9ylw_pS8B|D5gMZ3x@!fmh2FTdAL-SG%RoSJ^r`2g&v ziG5T^p=Cb`ape(|3l#*)0zJ~OO%7*~_RX=x#EyKrBcJcc7d!GLdf+qLf3Vg?#7eW*ocwDUblckWVCzr@AXGjiq0m7dp<|<=kV1BEbDJ*JsE?gY zDve(pWl`8MT7;b_?O>*~0+>4}b2P&m3M1b}3+NQ8qYD#mgW|13jqW5O>E#^vvW3p~ zi%x`nu~r@4N%#*{-a!yF?x9ruI;!8fdHcrNd+W;^?{_wDZ+v%o<<|20t<43eZJSCr zpy5-1yg4SotEHd*K)`;oEB{!1|3u$sFw}=(kSmj6qXB?bzJn==T$@zK3B}JvlB~vC(>0t#coYKRdmjY2Td5Skf>lA0kq$%xFf~IJA zGkP<(PRZcxVNEHiEJy`uVM?=@(%Au#W$t0Lnd0Z?=k*^5z=StL$&{o{X9-5%7GD4L zSM2$<3S}R6?YECsdFv*>`Lq2LpmYy&a$d<4-=4M}=FGelpoOJ9%qjWlAopjmW^iP1 zgPf4Ti|hbVpph236&J%Nys%$+lX>j{AboR+_MX$O-@!&^NcrYiWJr56mX`K#0S!&9 wFe=Y4N&y+^&5*(D&6pum+QSD7n|NH3yrp@W=|zdT@g<2#IjK`(r Date: Mon, 26 Jul 2021 17:39:59 +0100 Subject: [PATCH 09/16] CAT-18 upgrade spaCy to v3 --- README.md | 6 +- medcat/cdb_maker.py | 12 +- medcat/linking/context_based_linker.py | 5 +- medcat/meta_cat.py | 1 + medcat/ner/vocab_based_ner.py | 3 +- medcat/pipe.py | 26 +- medcat/preprocessing/taggers.py | 65 +++-- medcat/utils/make_vocab.py | 2 +- setup.py | 2 +- tests/__init__.py | 0 tests/archive_tests/test_cdb_maker_archive.py | 237 ++++++++--------- tests/archive_tests/test_ner_archive.py | 246 +++++++++--------- tests/test_cdb_maker.py | 16 +- tests/test_ner.py | 4 +- webapp/webapp/Dockerfile | 2 +- 15 files changed, 338 insertions(+), 289 deletions(-) create mode 100644 tests/__init__.py diff --git a/README.md b/README.md index 7b8fc2518..c124c409e 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,9 @@ A guide on how to use MedCAT is available in the [tutorial](https://github.com/C 2. Get the scispacy models: -`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz` +`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz` -`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz` +`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz` 3. Downlad the Vocabulary and CDB from the Models section bellow @@ -98,7 +98,7 @@ CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat) - Buil MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects is an annotation Affirmed (Positve) or Other (Negated or Hypothetical) -(Note: This is was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as +(Note: This was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as that data is not publicaly available.) ### SNOMED-CT and UMLS diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index c1cdc758e..3597831aa 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -44,28 +44,28 @@ def __init__(self, config, cdb=None, name_max_words=20): # Build the required spacy pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False, full_build=False, only_existing_cuis=False, **kwargs): - r''' Compile one or multipe CSVs into a CDB. + r''' Compile one or multiple CSVs into a CDB. 
Args: csv_paths (`List[str]`): An array of paths to the csv files that should be processed - full_build (`bool`, defautls to `True`): + full_build (`bool`, defaults to `True`): If False only the core portions of the CDB will be built (the ones required for the functioning of MedCAT). If True, everything will be added to the CDB - this usually includes concept descriptions, various forms of names etc (take care that this option produces a much larger CDB). sep (`str`, defaults to `,`): - If necessarya a custom separator for the csv files + If necessary a custom separator for the csv files encoding (`str`, optional): - Encoing to be used for reading the CSV file + Encoding to be used for reading the CSV file escapechar (`str`, optional): - Escapechar for the CSV + Escape char for the CSV index_col (`bool`, defaults_to `False`): Index column for pandas read_csv only_existing_cuis (`bool`, defaults to False): diff --git a/medcat/linking/context_based_linker.py b/medcat/linking/context_based_linker.py index e7fb09578..94d1a01dc 100644 --- a/medcat/linking/context_based_linker.py +++ b/medcat/linking/context_based_linker.py @@ -1,7 +1,8 @@ -from medcat.utils.filters import check_filters -from medcat.linking.vector_context_model import ContextModel import random import logging +from medcat.utils.filters import check_filters +from medcat.linking.vector_context_model import ContextModel + class Linker(object): r''' Link to a biomedical database. diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index af197801f..3b46b856e 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -10,6 +10,7 @@ from medcat.preprocessing.tokenizers import TokenizerWrapperBPE from medcat.preprocessing.tokenizers import TokenizerWrapperBERT + class MetaCAT(object): r''' TODO: Add documentation ''' diff --git a/medcat/ner/vocab_based_ner.py b/medcat/ner/vocab_based_ner.py index 9aef8f41f..7c12c5ff2 100644 --- a/medcat/ner/vocab_based_ner.py +++ b/medcat/ner/vocab_based_ner.py @@ -1,5 +1,6 @@ -from medcat.ner.vocab_based_annotator import maybe_annotate_name import logging +from medcat.ner.vocab_based_annotator import maybe_annotate_name + class NER(object): r''' diff --git a/medcat/pipe.py b/medcat/pipe.py index 470fc146c..d22094bb5 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -1,7 +1,8 @@ +import spacy from spacy.tokens import Token, Doc, Span +from spacy.language import Language from medcat.utils.normalizers import TokenNormalizer -import spacy -import os + class Pipe(object): r''' A wrapper around the standard spacy pipeline. @@ -21,6 +22,7 @@ def __init__(self, tokenizer, config): if config.preprocessing['stopwords'] is not None: self.nlp.Defaults.stop_words = set(config.preprocessing['stopwords']) self.nlp.tokenizer = tokenizer(self.nlp) + self.config = config def add_tagger(self, tagger, name, additional_fields=[]): @@ -35,7 +37,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. 
''' - self.nlp.add_pipe(tagger, name='tag_' + name, first=True) + component_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -46,7 +50,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) - self.nlp.add_pipe(token_normalizer, name='token_normalizer', last=True) + component_name = spacy.util.get_object_name(token_normalizer) + Language.component(name=component_name, func=token_normalizer) + self.nlp.add_pipe(component_name, name='token_normalizer', last=True) # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) @@ -57,7 +63,9 @@ def add_ner(self, ner): to the document and Span objects. ''' - self.nlp.add_pipe(ner, name='cat_ner', last=True) + component_name = spacy.util.get_object_name(ner) + Language.component(name=component_name, func=ner) + self.nlp.add_pipe(component_name, name='cat_ner', last=True) Doc.set_extension('ents', default=[], force=True) Span.set_extension('confidence', default=-1, force=True) @@ -76,13 +84,17 @@ def add_linker(self, linker): Any object/function created based on the requirements for a spaCy pipeline components. Have a look at https://spacy.io/usage/processing-pipelines#custom-components ''' - self.nlp.add_pipe(linker, name='cat_linker', last=True) + component_name = spacy.util.get_object_name(linker) + Language.component(name=component_name, func=linker) + self.nlp.add_pipe(component_name, name='cat_linker', last=True) Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) def add_meta_cat(self, meta_cat, name): - self.nlp.add_pipe(meta_cat, name=name, last=True) + component_name = spacy.util.get_object_name(meta_cat) + Language.component(name=component_name, func=meta_cat) + self.nlp.add_pipe(component_name, name=name, last=True) # Only the meta_anns field is needed, it will be a dictionary #of {category_name: value, ...} diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index 474e2d019..c63539a12 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,30 +1,45 @@ import re -def tag_skip_and_punct(doc, config): +def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. - config (`medcat.config.Config`): - Global config for medcat. - - Return: - (`spacy.tokens.Doc): - Tagged spacy document + Args: + config (`medcat.config.Config`): + Global config for medcat. 
''' - # Make life easier - cnf_p = config.preprocessing - - for token in doc: - if config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: - # There can't be punct in a token if it also has text - token._.is_punct = True - token._.to_skip = True - elif config.word_skipper.match(token.lower_): - # Skip if specific strings - token._.to_skip = True - elif cnf_p['skip_stopwords'] and token.is_stop: - token._.to_skip = True - - return doc + + return TagSkipAndPunct(nlp, name, config) + +class TagSkipAndPunct(object): + + def __init__(self, nlp, name, config): + self.nlp = nlp + self.name = name + self.config = config + + def __call__(self, doc): + r''' Detects and tags spacy tokens that are punctuation and that should be skipped. + + Args: + doc (`spacy.tokens.Doc`): + Spacy document that will be tagged. + + Return: + (`spacy.tokens.Doc): + Tagged spacy document + ''' + # Make life easier + cnf_p = self.config.preprocessing + + for token in doc: + if self.config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: + # There can't be punct in a token if it also has text + token._.is_punct = True + token._.to_skip = True + elif self.config.word_skipper.match(token.lower_): + # Skip if specific strings + token._.to_skip = True + elif cnf_p['skip_stopwords'] and token.is_stop: + token._.to_skip = True + + return doc diff --git a/medcat/utils/make_vocab.py b/medcat/utils/make_vocab.py index 01547d4c9..a9ebfc18f 100644 --- a/medcat/utils/make_vocab.py +++ b/medcat/utils/make_vocab.py @@ -40,7 +40,7 @@ def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None): # Build the required spacy pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff --git a/setup.py b/setup.py index 3e33f8f58..00042b257 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==2.3.4', + 'spacy==3.1.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 422fde2cd..8b8bb2acd 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -1,119 +1,124 @@ -r''' The tests here are a bit messy but they work, should be converted to python unittests. 
-''' -from medcat.cdb_maker import CDBMaker -from medcat.config import Config -import numpy as np import logging - -config = Config() -config.general['log_level'] = logging.DEBUG -maker = CDBMaker(config) - -# Building a new CDB from two files (full_build) -csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] -cdb = maker.prepare_csvs(csvs, full_build=True) - -assert len(cdb.cui2names) == 3 -assert len(cdb.cui2snames) == 3 -assert len(cdb.name2cuis) == 5 -assert len(cdb.cui2tags) == 3 -assert len(cdb.cui2preferred_name) == 2 -assert len(cdb.cui2context_vectors) == 3 -assert len(cdb.cui2count_train) == 3 -assert cdb.name2cuis2status['virus']['C0000039'] == 'P' -assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} -assert cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} -assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") - -# Test name addition -from medcat.preprocessing.cleaners import prepare_name -cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config), name_status='P', full_build=True) -assert cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} -assert 'my:newname.' in cdb.name2cuis -assert 'my:new' in cdb.snames -assert 'my:newname.' in cdb.name2cuis2status -assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} - -# Test name removal -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -# Run again to make sure it does not break anything -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -assert len(cdb.name2cuis) == 5 -assert 'my:newname.' not in cdb.name2cuis2status - -# Test filtering -cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept -cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) -assert len(cdb.cui2names) == 2 -assert len(cdb.name2cuis) == 4 -assert len(cdb.snames) == 4 - -# Test vector addition +import unittest import numpy as np -cdb.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - - -# Test negative -for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=True) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - -# Test save/load from medcat.cdb import CDB -cdb.save("./tmp_cdb.dat") -cdb2 = CDB.load('./tmp_cdb.dat') -# Check a random thing -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] - -# Test training import -cdb.reset_training() -cdb2.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) - -cdb2.import_training(cdb=cdb, overwrite=True) -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] -assert cdb2.cui2count_train['C0000139'] == 
cdb.cui2count_train['C0000139'] - -# Test concept similarity -cdb = CDB(config=config) -np.random.seed(11) -for i in range(500): - cui = "C" + str(i) - type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), maker.nlp, {}, config), ontologies=set(), - name_status='P', type_ids=type_ids, description='', full_build=True) - - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) -res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) -assert len(res) == 10 - -# Test training reset -cdb.reset_training() -assert len(cdb.cui2context_vectors['C0']) == 0 -assert cdb.cui2count_train['C0'] == 0 +from medcat.cdb_maker import CDBMaker +from medcat.config import Config +from medcat.preprocessing.cleaners import prepare_name + + +class CdbMakerArchiveTests(unittest.TestCase): + + def setUp(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + self.maker = CDBMaker(self.config) + + # Building a new CDB from two files (full_build) + csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + self.cdb = self.maker.prepare_csvs(csvs, full_build=True) + + def test_prepare_csvs(self): + assert len(self.cdb.cui2names) == 3 + assert len(self.cdb.cui2snames) == 3 + assert len(self.cdb.name2cuis) == 5 + assert len(self.cdb.cui2tags) == 3 + assert len(self.cdb.cui2preferred_name) == 2 + assert len(self.cdb.cui2context_vectors) == 3 + assert len(self.cdb.cui2count_train) == 3 + assert self.cdb.name2cuis2status['virus']['C0000039'] == 'P' + assert self.cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} + assert self.cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} + assert self.cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") + + def test_name_addition(self): + self.cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config), name_status='P', full_build=True) + assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} + assert 'my:newname.' in self.cdb.name2cuis + assert 'my:new' in self.cdb.snames + assert 'my:newname.' in self.cdb.name2cuis2status + assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} + + def test_name_removal(self): + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + # Run again to make sure it does not break anything + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + assert len(self.cdb.name2cuis) == 5 + assert 'my:newname.' 
not in self.cdb.name2cuis2status + + def test_filtering(self): + cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept + self.cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) + assert len(self.cdb.cui2names) == 2 + assert len(self.cdb.name2cuis) == 4 + assert len(self.cdb.snames) == 4 + + def test_vector_addition(self): + self.cdb.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + + def test_negative(self): + cuis = list(self.cdb.cui2names.keys()) + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=True) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + def test_save_and_load(self): + self.cdb.save("./tmp_cdb.dat") + cdb2 = CDB.load('./tmp_cdb.dat') + # Check a random thing + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + + def test_training_import(self): + cdb2 = CDB.load('./tmp_cdb.dat') + self.cdb.reset_training() + cdb2.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + cdb2.import_training(cdb=self.cdb, overwrite=True) + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + assert cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139'] + + def test_concept_similarity(self): + cdb = CDB(config=self.config) + np.random.seed(11) + for i in range(500): + cui = "C" + str(i) + type_ids = {'T-' + str(i%10)} + cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config), ontologies=set(), + name_status='P', type_ids=type_ids, description='', full_build=True) + + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + cdb.update_context_vector(cui, vectors, negative=False) + res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) + assert len(res) == 10 + + def test_training_reset(self): + self.cdb.reset_training() + assert len(self.cdb.cui2context_vectors['C0']) == 0 + assert self.cdb.cui2count_train['C0'] == 0 diff --git a/tests/archive_tests/test_ner_archive.py b/tests/archive_tests/test_ner_archive.py index 1be695f35..6037f3c16 100644 --- a/tests/archive_tests/test_ner_archive.py +++ b/tests/archive_tests/test_ner_archive.py @@ -1,3 +1,9 @@ +import logging +import os +import unittest +import numpy as np +from timeit import default_timer as timer +from medcat.cdb import CDB from medcat.preprocessing.tokenizers import spacy_split_all from medcat.ner.vocab_based_ner import NER from medcat.preprocessing.taggers import tag_skip_and_punct @@ -6,127 +12,127 @@ from medcat.vocab import Vocab from medcat.preprocessing.cleaners import prepare_name from 
medcat.linking.vector_context_model import ContextModel -from functools import partial from medcat.linking.context_based_linker import Linker from medcat.config import Config -import logging -from medcat.cdb import CDB -import os -import requests - -config = Config() -config.general['log_level'] = logging.INFO -cdb = CDB(config=config) - -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config), - name='skip_and_punct', - additional_fields=['is_punct']) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} - -vocab_path = "./tmp_vocab.dat" -if not os.path.exists(vocab_path): - import requests - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") - with open(vocab_path, 'wb') as f: - f.write(tmp.content) - -vocab = Vocab.load(vocab_path) -# Make the pipeline -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config), - name='skip_and_punct', - additional_fields=['is_punct']) -spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=vocab) -nlp.add_token_normalizer(spell_checker=spell_checker, config=config) -ner = NER(cdb, config) -nlp.add_ner(ner) - -# Add Linker -link = Linker(cdb, vocab, config) -nlp.add_linker(link) - -# Test limits for tokens and uppercase -config.ner['max_skip_tokens'] = 1 -config.ner['upper_case_limit_len'] = 4 -config.linking['disamb_length_limit'] = 2 -text = "CDB - I was running and then Movar Virus attacked and CDb" -d = nlp(text) - -assert len(d._.ents) == 2 -assert d._.ents[0]._.link_candidates[0] == 'S-229004' - -# Change limit for skip -config.ner['max_skip_tokens'] = 3 -d = nlp(text) -assert len(d._.ents) == 3 - -# Change limit for upper_case -config.ner['upper_case_limit_len'] = 3 -d = nlp(text) -assert len(d._.ents) == 4 - -# Check name length limit -config.ner['min_name_len'] = 4 -d = nlp(text) -assert len(d._.ents) == 2 - -# Speed tests -from timeit import default_timer as timer -text = "CDB - I was running and then Movar Virus attacked and CDb" -text = text * 300 -config.general['spell_check'] = True -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - -# Now without spell check -config.general['spell_check'] = False -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - - -# Test for linker -import numpy as np - -config = Config() -config.general['log_level'] = logging.DEBUG -cdb = CDB(config=config) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -cdb.add_names(cui='S-2290045', names=prepare_name('Movar', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} - -cuis = list(cdb.cui2names.keys()) -for cui in cuis[0:50]: - vectors = {'short': np.random.rand(300), - 'long': np.random.rand(300), - 'medium': np.random.rand(300) - } - cdb.update_context_vector(cui, vectors, 
negative=False) - -vocab = Vocab.load(vocab_path) -cm = ContextModel(cdb, vocab, config) -cm.train_using_negative_sampling('S-229004') -config.linking['train_count_threshold'] = 0 - -cm.train('S-229004', d._.ents[1], d) -cm.similarity('S-229004', d._.ents[1], d) -cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) +class NerArchiveTests(unittest.TestCase): + + def setUp(self) -> None: + self.config = Config() + self.config.general['log_level'] = logging.INFO + cdb = CDB(config=self.config) + + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} + + self.vocab_path = "./tmp_vocab.dat" + if not os.path.exists(self.vocab_path): + import requests + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") + with open(self.vocab_path, 'wb') as f: + f.write(tmp.content) + + vocab = Vocab.load(self.vocab_path) + # Make the pipeline + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab) + self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config) + ner = NER(cdb, self.config) + self.nlp.add_ner(ner) + + # Add Linker + link = Linker(cdb, vocab, self.config) + self.nlp.add_linker(link) + + self.text = "CDB - I was running and then Movar Virus attacked and CDb" + + def test_limits_for_tokens_and_uppercase(self): + self.config.ner['max_skip_tokens'] = 1 + self.config.ner['upper_case_limit_len'] = 4 + self.config.linking['disamb_length_limit'] = 2 + + d = self.nlp(self.text) + + assert len(d._.ents) == 2 + assert d._.ents[0]._.link_candidates[0] == 'S-229004' + + def test_change_limit_for_skip(self): + self.config.ner['max_skip_tokens'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 3 + + def test_change_limit_for_upper_case(self): + self.config.ner['upper_case_limit_len'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 4 + + def test_check_name_length_limit(self): + self.config.ner['min_name_len'] = 4 + d = self.nlp(self.text) + assert len(d._.ents) == 2 + + def test_speed(self): + text = "CDB - I was running and then Movar Virus attacked and CDb" + text = text * 300 + self.config.general['spell_check'] = True + start = timer() + for i in range(50): + d = self.nlp(text) + end = timer() + print("Time: ", end - start) + + def test_without_spell_check(self): + # Now without spell check + self.config.general['spell_check'] = False + start = timer() + for i in range(50): + d = self.nlp(self.text) + end = timer() + print("Time: ", end - start) + + + def test_for_linker(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + cdb = CDB(config=self.config) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config)) + 
cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + cdb.add_names(cui='S-2290045', names=prepare_name('Movar', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} + + cuis = list(cdb.cui2names.keys()) + for cui in cuis[0:50]: + vectors = {'short': np.random.rand(300), + 'long': np.random.rand(300), + 'medium': np.random.rand(300) + } + cdb.update_context_vector(cui, vectors, negative=False) + + d = self.nlp(self.text) + vocab = Vocab.load(self.vocab_path) + cm = ContextModel(cdb, vocab, self.config) + cm.train_using_negative_sampling('S-229004') + self.config.linking['train_count_threshold'] = 0 + + cm.train('S-229004', d._.ents[1], d) + + cm.similarity('S-229004', d._.ents[1], d) + + cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) diff --git a/tests/test_cdb_maker.py b/tests/test_cdb_maker.py index 3917c075b..9d84215a5 100644 --- a/tests/test_cdb_maker.py +++ b/tests/test_cdb_maker.py @@ -1,10 +1,11 @@ import unittest +import logging +import os +import numpy as np from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB from medcat.config import Config from medcat.preprocessing.cleaners import prepare_name -import numpy as np -import logging #cdb.csv #cui name ontologies name_status type_ids description @@ -20,6 +21,7 @@ #TESTS RUN IN ALPHABETICAL ORDER - CONTROLLING WITH '[class_letter]Class and test_[classletter subclassletter]' function syntax + class A_CDBMakerLoadTests(unittest.TestCase): @classmethod @@ -28,7 +30,10 @@ def setUpClass(cls): config = Config() config.general['log_level'] = logging.DEBUG maker = CDBMaker(config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = maker.prepare_csvs(csvs, full_build=True) def test_aa_cdb_names_length(self): @@ -110,7 +115,10 @@ def setUpClass(cls): cls.config = Config() cls.config.general['log_level'] = logging.DEBUG cls.maker = CDBMaker(cls.config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) cls.cdb2 = CDB(cls.config) diff --git a/tests/test_ner.py b/tests/test_ner.py index e05be9659..14720c205 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -28,7 +28,7 @@ def setUpClass(cls): print("Set up Vocab") vocab_path = "./tmp_vocab.dat" if not os.path.exists(vocab_path): - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") with open(vocab_path, 'wb') as f: f.write(tmp.content) @@ -37,7 +37,7 @@ def setUpClass(cls): print("Set up NLP pipeline") cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config) - cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config), + cls.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff --git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccf9dfa81..ccd5e591c 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -19,7 +19,7 @@ RUN pip install -r requirements.txt # Get the 
spacy and scipspacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From 365faff1fa3a3046c0044b113d6f3058ec99af8a Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:12:33 +0100 Subject: [PATCH 10/16] CAT-31 add github actions and manage scispaCy models using requirements-*.txt --- .github/workflows/main.yml | 30 ++++++++++++++++++++++++++++++ medcat/utils/normalizers.py | 5 +---- requirements-lg.txt | 2 ++ requirements-sm.txt | 2 ++ requirements.txt | 1 + setup.py | 4 ++-- 6 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/main.yml create mode 100644 requirements-lg.txt create mode 100644 requirements-sm.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..b5eaceb3c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,30 @@ +name: Main + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ 3.7 ] + max-parallel: 1 + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Test + run: | + python -m unittest discover \ No newline at end of file diff --git a/medcat/utils/normalizers.py b/medcat/utils/normalizers.py index cc15bb0f6..8e6a07e27 100644 --- a/medcat/utils/normalizers.py +++ b/medcat/utils/normalizers.py @@ -1,9 +1,6 @@ -#import hunspell import re -from collections import Counter -from spacy.tokens import Span import spacy -import os + CONTAINS_NUMBER = re.compile('[0-9]+') diff --git a/requirements-lg.txt b/requirements-lg.txt new file mode 100644 index 000000000..7d293df4e --- /dev/null +++ b/requirements-lg.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements-sm.txt b/requirements-sm.txt new file mode 100644 index 000000000..03885cbf2 --- /dev/null +++ b/requirements-sm.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9c558e357..78c7cf284 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ . 
+https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz \ No newline at end of file diff --git a/setup.py b/setup.py index 0012bfa07..f65830479 100644 --- a/setup.py +++ b/setup.py @@ -21,11 +21,11 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==3.1.1', + 'spacy<3.1.0,>=3.0.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', - 'Flask~=1.1', + 'tqdm<4.50.0,>=4.27', 'sklearn~=0.0', 'elasticsearch~=7.10', 'dill~=0.3.3', From b49a4523e0c300fcd6bff84c8975d0f6dce44201 Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 11/16] CAT-31 test github actions --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..865cfd186 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: Main on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: branches: [ master ] From 84a2595efe27b32d2bf845cddd26a235f8780705 Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 12/16] CAT-31 add badges to readme and fix vocab urls --- .github/workflows/main.yml | 6 +++--- README.md | 6 +++++- webapp/envs/env_medmen | 4 ++-- webapp/webapp/Dockerfile | 7 +++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..df7f5d505 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,8 +1,8 @@ -name: Main +name: build on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: branches: [ master ] @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.7 ] + python-version: [ 3.7, 3.8, 3.9 ] max-parallel: 1 steps: diff --git a/README.md b/README.md index f1eea4679..2437541b8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # Medical oncept Annotation Tool +[![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster) +[![Latest release](https://img.shields.io/github/v/release/CogStack/MedCAT)](https://github.com/CogStack/MedCAT/releases/latest) +[![pypi Version](https://img.shields.io/pypi/v/medcat.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/medcat/) + MedCAT can be used to extract information from Electronic Health Records (EHRs) and link it to biomedical ontologies like SNOMED-CT and UMLS. Paper on [arXiv](https://arxiv.org/abs/2010.01165). -## News +## News - **New Feature and Tutorial \[8. July 2021\]**: [Integrating 🤗 Transformers with MedCAT for biomedical NER+L](https://towardsdatascience.com/integrating-transformers-with-medcat-for-biomedical-ner-l-8869c76762a) - **General \[1. April 2021\]**: MedCAT is upgraded to v1, unforunately this introduces breaking changes with older models (MedCAT v0.4), as well as potential problems with all code that used the MedCAT package. MedCAT v0.4 is available on the legacy diff --git a/webapp/envs/env_medmen b/webapp/envs/env_medmen index 85b79f850..447f0dfb9 100644 --- a/webapp/envs/env_medmen +++ b/webapp/envs/env_medmen @@ -19,8 +19,8 @@ KEEP_PUNCT=:|. 
SPACY_MODEL=en_core_sci_md VOCAB_PATH=/webapp/models/vocab.dat CDB_PATH=/webapp/models/cdb.dat -VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat MKL_NUM_THREAD=1 NUMEXPR_NUM_THREADS=1 diff --git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccd5e591c..61575edba 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -6,8 +6,8 @@ RUN mkdir -p /webapp/models # Copy everything COPY . /webapp -ENV VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -ENV CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +ENV VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +ENV CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat ENV CDB_PATH=/webapp/models/cdb.dat ENV VOCAB_PATH=/webapp/models/vocab.dat @@ -17,9 +17,8 @@ WORKDIR /webapp RUN pip install -r requirements.txt -# Get the spacy and scipspacy model +# Get the spacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From bfeeba9c801d5224e345fa0de10865bf3c2945bb Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 17:54:15 +0100 Subject: [PATCH 13/16] CAT-18 replace another partial function --- .github/workflows/main.yml | 2 +- medcat/cat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df7f5d505..849333ac5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: python-version: [ 3.7, 3.8, 3.9 ] - max-parallel: 1 + max-parallel: 3 steps: - uses: actions/checkout@v2 diff --git a/medcat/cat.py b/medcat/cat.py index 9a7db9f15..f5c86dbed 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -70,7 +70,7 @@ def __init__(self, cdb, config, vocab, meta_cats=[]): # Build the pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) From 6c07e78d7f5266682cffe1f123a199604df8ca60 Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:06 +0100 Subject: [PATCH 14/16] CAT-18 close files after their use and pass on the replace param --- medcat/vocab.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/medcat/vocab.py b/medcat/vocab.py index d2558744a..f21de2c8a 100644 --- a/medcat/vocab.py +++ b/medcat/vocab.py @@ -134,7 +134,7 @@ def add_word(self, word, cnt=1, vec=None, replace=True): cnt (int): count of this word in your dataset vec (np.array): - the vector repesentation of the word + the vector representation of the word replace (bool): will replace old vector representation """ @@ -170,17 +170,16 @@ def add_words(self, path, replace=True): replace (bool): existing words in the vocabulary will be replaced """ - f = open(path) + with open(path) as f: + for line in f: + parts = line.split("\t") + word = parts[0] + cnt = int(parts[1].strip()) + vec = None + if len(parts) == 3: + vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - for line in f: - parts = line.split("\t") - word = parts[0] - cnt = int(parts[1].strip()) - 
vec = None - if len(parts) == 3: - vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - - self.add_word(word, cnt, vec) + self.add_word(word, cnt, vec, replace) def make_unigram_table(self, table_size=100000000): @@ -232,13 +231,17 @@ def get_negative_samples(self, n=6, ignore_punct_and_num=False): def __getitem__(self, word): - return self.vocab[word]['cnt'] + return self.count(word) def vec(self, word): return self.vocab[word]['vec'] + def count(self, word): + return self.vocab[word]['cnt'] + + def item(self, word): return self.vocab[word] From a70409d0dd57b49f8a5c8910224ca3f386550eb3 Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:58 +0100 Subject: [PATCH 15/16] CAT-18 refactor and add more tests --- examples/vocab_data.txt | 2 ++ medcat/cat.py | 2 +- medcat/pipe.py | 12 ++------ medcat/preprocessing/taggers.py | 21 +++++--------- tests/test_cat.py | 27 +++++++++++++++++ tests/test_cdb.py | 51 +++++++++++++++++++++++++++++++++ tests/test_vocab.py | 39 +++++++++++++++++++++++++ 7 files changed, 130 insertions(+), 24 deletions(-) create mode 100644 examples/vocab_data.txt create mode 100644 tests/test_cat.py create mode 100644 tests/test_cdb.py create mode 100644 tests/test_vocab.py diff --git a/examples/vocab_data.txt b/examples/vocab_data.txt new file mode 100644 index 000000000..da3f43048 --- /dev/null +++ b/examples/vocab_data.txt @@ -0,0 +1,2 @@ +house 34444 0.3232 0.123213 1.231231 +dog 14444 0.76762 0.76767 1.45454 diff --git a/medcat/cat.py b/medcat/cat.py index f5c86dbed..ce8a24628 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -116,7 +116,7 @@ def __call__(self, text, do_train=False): Returns: A spacy document with the extracted entities ''' - # Should we train - do not use this for training, unles you know what you are doing. Use the + # Should we train - do not use this for training, unless you know what you are doing. Use the #self.train() function self.config.linking['train'] = do_train diff --git a/medcat/pipe.py b/medcat/pipe.py index d22094bb5..7fd7c5a10 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -24,7 +24,6 @@ def __init__(self, tokenizer, config): self.nlp.tokenizer = tokenizer(self.nlp) self.config = config - def add_tagger(self, tagger, name, additional_fields=[]): r''' Add any kind of a tagger for tokens. @@ -37,9 +36,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. 
''' - component_name = spacy.util.get_object_name(tagger) - Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) - self.nlp.add_pipe(component_name, name='tag_' + name, first=True) + component_factory_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_factory_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_factory_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -47,7 +46,6 @@ def add_tagger(self, tagger, name, additional_fields=[]): for field in additional_fields: Token.set_extension(field, default=False, force=True) - def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) component_name = spacy.util.get_object_name(token_normalizer) @@ -57,7 +55,6 @@ def add_token_normalizer(self, config, spell_checker=None): # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) - def add_ner(self, ner): r''' Add NER from CAT to the pipeline, will also add the necessary fields to the document and Span objects. @@ -75,7 +72,6 @@ def add_ner(self, ner): Span.set_extension('detected_name', default=None, force=True) Span.set_extension('link_candidates', default=None, force=True) - def add_linker(self, linker): r''' Add entity linker to the pipeline, will also add the necessary fields to Span object. @@ -90,7 +86,6 @@ def add_linker(self, linker): Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) - def add_meta_cat(self, meta_cat, name): component_name = spacy.util.get_object_name(meta_cat) Language.component(name=component_name, func=meta_cat) @@ -100,6 +95,5 @@ def add_meta_cat(self, meta_cat, name): #of {category_name: value, ...} Span.set_extension('meta_anns', default=None, force=True) - def __call__(self, text): return self.nlp(text) diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index c63539a12..6ec2ac6e0 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,16 +1,19 @@ -import re - def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. Args: + nlp (spacy.language.): + The base spacy NLP pipeline. + name (`str`): + The component instance name. config (`medcat.config.Config`): Global config for medcat. ''' - return TagSkipAndPunct(nlp, name, config) + return _Tagger(nlp, name, config) + -class TagSkipAndPunct(object): +class _Tagger(object): def __init__(self, nlp, name, config): self.nlp = nlp @@ -18,16 +21,6 @@ def __init__(self, nlp, name, config): self.config = config def __call__(self, doc): - r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. 
- - Return: - (`spacy.tokens.Doc): - Tagged spacy document - ''' # Make life easier cnf_p = self.config.preprocessing diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 000000000..869bcba54 --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,27 @@ +import os +import unittest +from medcat.vocab import Vocab +from medcat.cdb import CDB +from medcat.cat import CAT + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) + self.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + self.cdb.config.ner['min_name_len'] = 2 + self.cdb.config.ner['upper_case_limit_len'] = 3 + self.cdb.config.general['spell_check'] = True + self.cdb.config.linking['train_count_threshold'] = 10 + self.cdb.config.linking['similarity_threshold'] = 0.3 + self.cdb.config.linking['train'] = True + self.cdb.config.linking['disamb_length_limit'] = 5 + self.cdb.config.general['full_unlink'] = True + self.undertest = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab) + + def test_pipeline(self): + text = "The dog is sitting outside the house." + doc = self.undertest(text) + self.assertEqual(text, doc.text) + diff --git a/tests/test_cdb.py b/tests/test_cdb.py new file mode 100644 index 000000000..0d9bad237 --- /dev/null +++ b/tests/test_cdb.py @@ -0,0 +1,51 @@ +import os +import shutil +import unittest +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class CDBTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_sci_md" + cls.cdb_maker = CDBMaker(config) + + def setUp(self) -> None: + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.csv") + cdb_2_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb_2.csv") + self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + self.undertest = CDBTests.cdb_maker.prepare_csvs([cdb_csv, cdb_2_csv], full_build=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_name2cuis(self): + self.assertEqual({ + 'second~csv': ['C0000239'], + 'virus': ['C0000039', 'C0000139'], + 'virus~k': ['C0000039', 'C0000139'], + 'virus~m': ['C0000039', 'C0000139'], + 'virus~z': ['C0000039', 'C0000139'] + }, self.undertest.name2cuis) + + def test_cui2names(self): + self.assertEqual({ + 'C0000039': {'virus~z', 'virus~k', 'virus~m', 'virus'}, + 'C0000139': {'virus~z', 'virus', 'virus~m', 'virus~k'}, + 'C0000239': {'second~csv'} + }, self.undertest.cui2names) + + def test_cui2preferred_name(self): + self.assertEqual({'C0000039': 'Virus', 'C0000139': 'Virus Z'}, self.undertest.cui2preferred_name) + + def test_cui2type_ids(self): + self.assertEqual({'C0000039': {'T109', 'T234', 'T123'}, 'C0000139': set(), 'C0000239': set()}, self.undertest.cui2type_ids) + + def test_save_and_load(self): + cdb_path = f"{self.tmp_dir}/cdb.dat" + self.undertest.save(cdb_path) + self.undertest.load(cdb_path) diff --git a/tests/test_vocab.py b/tests/test_vocab.py new file mode 100644 index 000000000..8db82df41 --- /dev/null +++ b/tests/test_vocab.py @@ -0,0 +1,39 @@ +import os +import shutil +import unittest +from medcat.vocab import Vocab + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.undertest = Vocab() + self.tmp_dir = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_add_words(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys())) + + def test_add_word(self): + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + self.assertEqual(["test"], list(self.undertest.vocab.keys())) + self.assertTrue("test" in self.undertest) + + def test_count(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(34444, self.undertest.count("house")) + + def test_save_and_load(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + vocab_path = f"{self.tmp_dir}/vocab.dat" + self.undertest.save(vocab_path) + vocab = Vocab.load(vocab_path) + self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys())) + + + From 7f843efc027933bb39b0495f853f1448616bb89a Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 12:13:27 +0100 Subject: [PATCH 16/16] CAT-31 reset workflows after testing --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 849333ac5..80a301542 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: build on: push: - branches: [ master, spacy-v3 ] + branches: [ master ] pull_request: branches: [ master ]