Merge pull request #92 from CogStack/spacy-v3
Upgrade spaCy to v3 and add the CI build pipeline
w-is-h authored Jul 28, 2021
2 parents 3aa9b9b + 3e480a6 commit 1222d40
Showing 30 changed files with 516 additions and 322 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,30 @@
name: build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [ 3.7, 3.8, 3.9 ]
      max-parallel: 3

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Test
        run: |
          python -m unittest discover
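The `Test` step relies on Python's built-in discovery, so any module matching `test*.py` in the repository is collected without an extra test runner. As a minimal, hypothetical sketch of a test this step would pick up (the file name and assertions are illustrative, not part of this commit):

```python
# tests/test_vocab.py -- hypothetical test module that
# `python -m unittest discover` would collect and run.
import unittest

from medcat.vocab import Vocab


class VocabTests(unittest.TestCase):

    def test_add_word_stores_count(self):
        vocab = Vocab()
        vocab.add_word("house", cnt=34444)
        # Vocab.__getitem__ returns the stored count (see medcat/vocab.py below)
        self.assertEqual(vocab["house"], 34444)


if __name__ == "__main__":
    unittest.main()
```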
1 change: 1 addition & 0 deletions .gitignore
@@ -41,3 +41,4 @@ tmp.py

# models files
*.dat
!examples/*.dat
12 changes: 8 additions & 4 deletions README.md
@@ -1,8 +1,12 @@
# Medical <img src="https://github.com/CogStack/MedCAT/blob/master/media/cat-logo.png" width=45> oncept Annotation Tool

[![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster)
[![Latest release](https://img.shields.io/github/v/release/CogStack/MedCAT)](https://github.com/CogStack/MedCAT/releases/latest)
[![pypi Version](https://img.shields.io/pypi/v/medcat.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/medcat/)

MedCAT can be used to extract information from Electronic Health Records (EHRs) and link it to biomedical ontologies like SNOMED-CT and UMLS. Paper on [arXiv](https://arxiv.org/abs/2010.01165).

## News
- **New Feature and Tutorial \[8. July 2021\]**: [Integrating 🤗 Transformers with MedCAT for biomedical NER+L](https://towardsdatascience.com/integrating-transformers-with-medcat-for-biomedical-ner-l-8869c76762a)
- **General \[1. April 2021\]**: MedCAT is upgraded to v1; unfortunately, this introduces breaking changes with older models (MedCAT v0.4),
as well as potential problems with all code that used the MedCAT package. MedCAT v0.4 is available on the legacy
@@ -30,9 +34,9 @@ A guide on how to use MedCAT is available in the [tutorial](https://github.com/C

2. Get the scispacy models:

`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz`
`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz`

`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz`
`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz`

3. Download the Vocabulary and CDB from the Models section below

@@ -98,7 +102,7 @@ CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat) - Buil
MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects if an annotation is Affirmed (Positive) or Other (Negated or Hypothetical)


(Note: This is was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as
(Note: This was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as
that data is not publicly available.)

### SNOMED-CT and UMLS
Binary file added examples/cdb.dat
Binary file not shown.
Binary file added examples/vocab.dat
Binary file not shown.
2 changes: 2 additions & 0 deletions examples/vocab_data.txt
@@ -0,0 +1,2 @@
house 34444 0.3232 0.123213 1.231231
dog 14444 0.76762 0.76767 1.45454
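The two example lines pair a word with its corpus count and an embedding vector. Going by `Vocab.add_words` further down this diff, the columns are tab-separated and the vector components space-separated; a minimal parsing sketch under that assumption:

```python
# Sketch of parsing a vocab_data.txt line, mirroring what
# Vocab.add_words (medcat/vocab.py, below) does: columns are
# tab-separated, vector components space-separated.
import numpy as np

def parse_vocab_line(line):
    parts = line.split("\t")
    word = parts[0]
    cnt = int(parts[1].strip())
    vec = None
    if len(parts) == 3:
        vec = np.array([float(x) for x in parts[2].strip().split(" ")])
    return word, cnt, vec

word, cnt, vec = parse_vocab_line("house\t34444\t0.3232 0.123213 1.231231")
# -> ('house', 34444, array([0.3232, 0.123213, 1.231231]))
```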
4 changes: 2 additions & 2 deletions medcat/cat.py
@@ -70,7 +70,7 @@ def __init__(self, cdb, config, vocab, meta_cats=[]):

# Build the pipeline
self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config),
self.nlp.add_tagger(tagger=tag_skip_and_punct,
name='skip_and_punct',
additional_fields=['is_punct'])

@@ -116,7 +116,7 @@ def __call__(self, text, do_train=False):
Returns:
A spacy document with the extracted entities
'''
# Should we train - do not use this for training, unles you know what you are doing. Use the
# Should we train - do not use this for training, unless you know what you are doing. Use the
#self.train() function
self.config.linking['train'] = do_train

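As the docstring above says, calling a `CAT` instance returns a spaCy document with the extracted entities. A hedged usage sketch, assuming the v1 `CDB.load`/`Vocab.load` helpers and the example model files added in this commit:

```python
# Hypothetical usage of CAT: CDB.load/Vocab.load and cdb.config are
# assumed v1 helpers; examples/cdb.dat and examples/vocab.dat are the
# binary files added in this commit.
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab

cdb = CDB.load("examples/cdb.dat")
vocab = Vocab.load("examples/vocab.dat")

cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
doc = cat("The patient was diagnosed with diabetes.", do_train=False)

for ent in doc._.ents:
    # 'ents' and 'cui' are the custom extensions registered in medcat/pipe.py
    print(ent, ent._.cui)
```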
12 changes: 6 additions & 6 deletions medcat/cdb_maker.py
@@ -44,28 +44,28 @@ def __init__(self, config, cdb=None, name_max_words=20):

# Build the required spacy pipeline
self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config),
self.nlp.add_tagger(tagger=tag_skip_and_punct,
name='skip_and_punct',
additional_fields=['is_punct'])


def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False, full_build=False, only_existing_cuis=False, **kwargs):
r''' Compile one or multipe CSVs into a CDB.
r''' Compile one or multiple CSVs into a CDB.
Args:
csv_paths (`List[str]`):
An array of paths to the csv files that should be processed
full_build (`bool`, defautls to `True`):
full_build (`bool`, defaults to `True`):
If False only the core portions of the CDB will be built (the ones required for
the functioning of MedCAT). If True, everything will be added to the CDB - this
usually includes concept descriptions, various forms of names etc (take care that
this option produces a much larger CDB).
sep (`str`, defaults to `,`):
If necessarya a custom separator for the csv files
If necessary a custom separator for the csv files
encoding (`str`, optional):
Encoing to be used for reading the CSV file
Encoding to be used for reading the CSV file
escapechar (`str`, optional):
Escapechar for the CSV
Escape char for the CSV
index_col (`bool`, defaults to `False`):
Index column for pandas read_csv
only_existing_cuis (`bool`, defaults to False):
5 changes: 3 additions & 2 deletions medcat/linking/context_based_linker.py
@@ -1,7 +1,8 @@
from medcat.utils.filters import check_filters
from medcat.linking.vector_context_model import ContextModel
import random
import logging
from medcat.utils.filters import check_filters
from medcat.linking.vector_context_model import ContextModel


class Linker(object):
r''' Link to a biomedical database.
1 change: 1 addition & 0 deletions medcat/meta_cat.py
@@ -10,6 +10,7 @@
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE
from medcat.preprocessing.tokenizers import TokenizerWrapperBERT


class MetaCAT(object):
r''' TODO: Add documentation
'''
3 changes: 2 additions & 1 deletion medcat/ner/vocab_based_ner.py
@@ -1,5 +1,6 @@
from medcat.ner.vocab_based_annotator import maybe_annotate_name
import logging
from medcat.ner.vocab_based_annotator import maybe_annotate_name


class NER(object):
r'''
32 changes: 19 additions & 13 deletions medcat/pipe.py
@@ -1,7 +1,8 @@
import spacy
from spacy.tokens import Token, Doc, Span
from spacy.language import Language
from medcat.utils.normalizers import TokenNormalizer
import spacy
import os


class Pipe(object):
r''' A wrapper around the standard spacy pipeline.
@@ -21,7 +22,7 @@ def __init__(self, tokenizer, config):
if config.preprocessing['stopwords'] is not None:
self.nlp.Defaults.stop_words = set(config.preprocessing['stopwords'])
self.nlp.tokenizer = tokenizer(self.nlp)

self.config = config

def add_tagger(self, tagger, name, additional_fields=[]):
r''' Add any kind of a tagger for tokens.
@@ -35,29 +36,33 @@ def add_tagger(self, tagger, name, additional_fields=[]):
additional_fields (`List[str]`):
Fields to be added to the `_` properties of a token.
'''
self.nlp.add_pipe(tagger, name='tag_' + name, first=True)
component_factory_name = spacy.util.get_object_name(tagger)
Language.factory(name=component_factory_name, default_config={"config": self.config}, func=tagger)
self.nlp.add_pipe(component_factory_name, name='tag_' + name, first=True)
# Add custom fields needed for this usecase
Token.set_extension('to_skip', default=False, force=True)

# Add any additional fields that are required
for field in additional_fields:
Token.set_extension(field, default=False, force=True)


def add_token_normalizer(self, config, spell_checker=None):
token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config)
self.nlp.add_pipe(token_normalizer, name='token_normalizer', last=True)
component_name = spacy.util.get_object_name(token_normalizer)
Language.component(name=component_name, func=token_normalizer)
self.nlp.add_pipe(component_name, name='token_normalizer', last=True)

# Add custom fields needed for this usecase
Token.set_extension('norm', default=None, force=True)


def add_ner(self, ner):
r''' Add NER from CAT to the pipeline, will also add the necessary fields
to the document and Span objects.
'''
self.nlp.add_pipe(ner, name='cat_ner', last=True)
component_name = spacy.util.get_object_name(ner)
Language.component(name=component_name, func=ner)
self.nlp.add_pipe(component_name, name='cat_ner', last=True)

Doc.set_extension('ents', default=[], force=True)
Span.set_extension('confidence', default=-1, force=True)
@@ -67,7 +72,6 @@ def add_ner(self, ner):
Span.set_extension('detected_name', default=None, force=True)
Span.set_extension('link_candidates', default=None, force=True)


def add_linker(self, linker):
r''' Add entity linker to the pipeline, will also add the necessary fields
to Span object.
@@ -76,18 +80,20 @@ def add_linker(self, linker):
Any object/function created based on the requirements for a spaCy pipeline components. Have
a look at https://spacy.io/usage/processing-pipelines#custom-components
'''
self.nlp.add_pipe(linker, name='cat_linker', last=True)
component_name = spacy.util.get_object_name(linker)
Language.component(name=component_name, func=linker)
self.nlp.add_pipe(component_name, name='cat_linker', last=True)
Span.set_extension('cui', default=-1, force=True)
Span.set_extension('context_similarity', default=-1, force=True)


def add_meta_cat(self, meta_cat, name):
self.nlp.add_pipe(meta_cat, name=name, last=True)
component_name = spacy.util.get_object_name(meta_cat)
Language.component(name=component_name, func=meta_cat)
self.nlp.add_pipe(component_name, name=name, last=True)

# Only the meta_anns field is needed, it will be a dictionary
#of {category_name: value, ...}
Span.set_extension('meta_anns', default=None, force=True)


def __call__(self, text):
return self.nlp(text)
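The recurring change in this file is the spaCy v3 contract: `nlp.add_pipe` now takes the *name* of a registered component rather than a callable, so each wrapper first registers its callable with `Language.component` (or `Language.factory` when the component needs construction-time config) and then adds it by name. A self-contained sketch of the same pattern, using a toy component that is not part of MedCAT:

```python
# Stand-alone sketch of the spaCy v3 registration pattern used by
# Pipe.add_ner/add_linker above; 'mark_long_tokens' is a toy stand-in.
import spacy
from spacy.language import Language
from spacy.tokens import Doc, Token

Token.set_extension("is_long", default=False, force=True)


def mark_long_tokens(doc: Doc) -> Doc:
    # Toy component: flag tokens longer than ten characters.
    for token in doc:
        token._.is_long = len(token.text) > 10
    return doc


# spaCy v3: register the callable first, then add it to the pipe by name.
component_name = spacy.util.get_object_name(mark_long_tokens)
Language.component(name=component_name, func=mark_long_tokens)

nlp = spacy.blank("en")
nlp.add_pipe(component_name, name="mark_long", last=True)
doc = nlp("A sentence containing an extraordinarily long token.")
```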
62 changes: 35 additions & 27 deletions medcat/preprocessing/taggers.py
@@ -1,30 +1,38 @@
import re

def tag_skip_and_punct(doc, config):
def tag_skip_and_punct(nlp, name, config):
r''' Detects and tags spacy tokens that are punctuation and that should be skipped.
Args:
doc (`spacy.tokens.Doc`):
Spacy document that will be tagged.
config (`medcat.config.Config`):
Global config for medcat.
Return:
(`spacy.tokens.Doc):
Tagged spacy document
Args:
nlp (spacy.language.<lng>):
The base spacy NLP pipeline.
name (`str`):
The component instance name.
config (`medcat.config.Config`):
Global config for medcat.
'''
# Make life easier
cnf_p = config.preprocessing

for token in doc:
if config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']:
# There can't be punct in a token if it also has text
token._.is_punct = True
token._.to_skip = True
elif config.word_skipper.match(token.lower_):
# Skip if specific strings
token._.to_skip = True
elif cnf_p['skip_stopwords'] and token.is_stop:
token._.to_skip = True

return doc

return _Tagger(nlp, name, config)


class _Tagger(object):

def __init__(self, nlp, name, config):
self.nlp = nlp
self.name = name
self.config = config

def __call__(self, doc):
# Make life easier
cnf_p = self.config.preprocessing

for token in doc:
if self.config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']:
# There can't be punct in a token if it also has text
token._.is_punct = True
token._.to_skip = True
elif self.config.word_skipper.match(token.lower_):
# Skip if specific strings
token._.to_skip = True
elif cnf_p['skip_stopwords'] and token.is_stop:
token._.to_skip = True

return doc
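Under spaCy v3 a factory receives `nlp`, the instance `name`, and any keys from `default_config`, and returns the component that documents are piped through, which is why `tag_skip_and_punct` now builds and returns a `_Tagger`. A sketch of how `Pipe.add_tagger` (above) wires this factory up; `Config()` here stands in for whatever medcat config the caller holds:

```python
# Sketch mirroring Pipe.add_tagger (medcat/pipe.py above): register
# tag_skip_and_punct as a spaCy v3 factory, then add it by name.
import spacy
from spacy.language import Language

from medcat.config import Config
from medcat.preprocessing.taggers import tag_skip_and_punct

config = Config()  # stand-in for the caller's config
nlp = spacy.blank("en")

factory_name = spacy.util.get_object_name(tag_skip_and_punct)
Language.factory(name=factory_name, default_config={"config": config}, func=tag_skip_and_punct)

# spaCy now calls tag_skip_and_punct(nlp, name, config) and pipes every
# document through the _Tagger instance it returns.
nlp.add_pipe(factory_name, name="tag_skip_and_punct", first=True)
```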
2 changes: 1 addition & 1 deletion medcat/utils/make_vocab.py
@@ -40,7 +40,7 @@ def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):

# Build the required spacy pipeline
self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config),
self.nlp.add_tagger(tagger=tag_skip_and_punct,
name='skip_and_punct',
additional_fields=['is_punct'])

5 changes: 1 addition & 4 deletions medcat/utils/normalizers.py
@@ -1,9 +1,6 @@
#import hunspell
import re
from collections import Counter
from spacy.tokens import Span
import spacy
import os


CONTAINS_NUMBER = re.compile('[0-9]+')

27 changes: 15 additions & 12 deletions medcat/vocab.py
@@ -134,7 +134,7 @@ def add_word(self, word, cnt=1, vec=None, replace=True):
cnt (int):
count of this word in your dataset
vec (np.array):
the vector repesentation of the word
the vector representation of the word
replace (bool):
will replace old vector representation
"""
@@ -170,17 +170,16 @@ def add_words(self, path, replace=True):
replace (bool):
existing words in the vocabulary will be replaced
"""
f = open(path)
with open(path) as f:
for line in f:
parts = line.split("\t")
word = parts[0]
cnt = int(parts[1].strip())
vec = None
if len(parts) == 3:
vec = np.array([float(x) for x in parts[2].strip().split(" ")])

for line in f:
parts = line.split("\t")
word = parts[0]
cnt = int(parts[1].strip())
vec = None
if len(parts) == 3:
vec = np.array([float(x) for x in parts[2].strip().split(" ")])

self.add_word(word, cnt, vec)
self.add_word(word, cnt, vec, replace)


def make_unigram_table(self, table_size=100000000):
@@ -232,13 +231,17 @@ def get_negative_samples(self, n=6, ignore_punct_and_num=False):


def __getitem__(self, word):
return self.vocab[word]['cnt']
return self.count(word)


def vec(self, word):
return self.vocab[word]['vec']


def count(self, word):
return self.vocab[word]['cnt']


def item(self, word):
return self.vocab[word]

2 changes: 2 additions & 0 deletions requirements-lg.txt
@@ -0,0 +1,2 @@
.
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz
2 changes: 2 additions & 0 deletions requirements-sm.txt
@@ -0,0 +1,2 @@
.
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
1 change: 1 addition & 0 deletions requirements.txt
@@ -1 +1,2 @@
.
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz