From 3fad158a88874755325ad24387af20a55ccd73ee Mon Sep 17 00:00:00 2001 From: baixiac Date: Mon, 26 Jul 2021 17:39:59 +0100 Subject: [PATCH 1/7] CAT-18 upgrade spaCy to v3 --- README.md | 6 +- medcat/cdb_maker.py | 12 +- medcat/linking/context_based_linker.py | 5 +- medcat/meta_cat.py | 1 + medcat/ner/vocab_based_ner.py | 3 +- medcat/pipe.py | 26 +- medcat/preprocessing/taggers.py | 65 +++-- medcat/utils/make_vocab.py | 2 +- setup.py | 2 +- tests/__init__.py | 0 tests/archive_tests/test_cdb_maker_archive.py | 237 ++++++++--------- tests/archive_tests/test_ner_archive.py | 246 +++++++++--------- tests/test_cdb_maker.py | 16 +- tests/test_ner.py | 4 +- webapp/webapp/Dockerfile | 2 +- 15 files changed, 338 insertions(+), 289 deletions(-) create mode 100644 tests/__init__.py diff --git a/README.md b/README.md index 7b8fc2518..c124c409e 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,9 @@ A guide on how to use MedCAT is available in the [tutorial](https://github.com/C 2. Get the scispacy models: -`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz` +`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz` -`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz` +`pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz` 3. 
Downlad the Vocabulary and CDB from the Models section bellow @@ -98,7 +98,7 @@ CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat) - Buil MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects is an annotation Affirmed (Positve) or Other (Negated or Hypothetical) -(Note: This is was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as +(Note: This was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as that data is not publicaly available.) ### SNOMED-CT and UMLS diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index c1cdc758e..3597831aa 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -44,28 +44,28 @@ def __init__(self, config, cdb=None, name_max_words=20): # Build the required spacy pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, index_col=False, full_build=False, only_existing_cuis=False, **kwargs): - r''' Compile one or multipe CSVs into a CDB. + r''' Compile one or multiple CSVs into a CDB. Args: csv_paths (`List[str]`): An array of paths to the csv files that should be processed - full_build (`bool`, defautls to `True`): + full_build (`bool`, defaults to `True`): If False only the core portions of the CDB will be built (the ones required for the functioning of MedCAT). If True, everything will be added to the CDB - this usually includes concept descriptions, various forms of names etc (take care that this option produces a much larger CDB). 
sep (`str`, defaults to `,`): - If necessarya a custom separator for the csv files + If necessary a custom separator for the csv files encoding (`str`, optional): - Encoing to be used for reading the CSV file + Encoding to be used for reading the CSV file escapechar (`str`, optional): - Escapechar for the CSV + Escape char for the CSV index_col (`bool`, defaults_to `False`): Index column for pandas read_csv only_existing_cuis (`bool`, defaults to False): diff --git a/medcat/linking/context_based_linker.py b/medcat/linking/context_based_linker.py index e7fb09578..94d1a01dc 100644 --- a/medcat/linking/context_based_linker.py +++ b/medcat/linking/context_based_linker.py @@ -1,7 +1,8 @@ -from medcat.utils.filters import check_filters -from medcat.linking.vector_context_model import ContextModel import random import logging +from medcat.utils.filters import check_filters +from medcat.linking.vector_context_model import ContextModel + class Linker(object): r''' Link to a biomedical database. diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index af197801f..3b46b856e 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -10,6 +10,7 @@ from medcat.preprocessing.tokenizers import TokenizerWrapperBPE from medcat.preprocessing.tokenizers import TokenizerWrapperBERT + class MetaCAT(object): r''' TODO: Add documentation ''' diff --git a/medcat/ner/vocab_based_ner.py b/medcat/ner/vocab_based_ner.py index 9aef8f41f..7c12c5ff2 100644 --- a/medcat/ner/vocab_based_ner.py +++ b/medcat/ner/vocab_based_ner.py @@ -1,5 +1,6 @@ -from medcat.ner.vocab_based_annotator import maybe_annotate_name import logging +from medcat.ner.vocab_based_annotator import maybe_annotate_name + class NER(object): r''' diff --git a/medcat/pipe.py b/medcat/pipe.py index 470fc146c..d22094bb5 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -1,7 +1,8 @@ +import spacy from spacy.tokens import Token, Doc, Span +from spacy.language import Language from medcat.utils.normalizers import TokenNormalizer 
-import spacy -import os + class Pipe(object): r''' A wrapper around the standard spacy pipeline. @@ -21,6 +22,7 @@ def __init__(self, tokenizer, config): if config.preprocessing['stopwords'] is not None: self.nlp.Defaults.stop_words = set(config.preprocessing['stopwords']) self.nlp.tokenizer = tokenizer(self.nlp) + self.config = config def add_tagger(self, tagger, name, additional_fields=[]): @@ -35,7 +37,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. ''' - self.nlp.add_pipe(tagger, name='tag_' + name, first=True) + component_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -46,7 +50,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) - self.nlp.add_pipe(token_normalizer, name='token_normalizer', last=True) + component_name = spacy.util.get_object_name(token_normalizer) + Language.component(name=component_name, func=token_normalizer) + self.nlp.add_pipe(component_name, name='token_normalizer', last=True) # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) @@ -57,7 +63,9 @@ def add_ner(self, ner): to the document and Span objects. 
''' - self.nlp.add_pipe(ner, name='cat_ner', last=True) + component_name = spacy.util.get_object_name(ner) + Language.component(name=component_name, func=ner) + self.nlp.add_pipe(component_name, name='cat_ner', last=True) Doc.set_extension('ents', default=[], force=True) Span.set_extension('confidence', default=-1, force=True) @@ -76,13 +84,17 @@ def add_linker(self, linker): Any object/function created based on the requirements for a spaCy pipeline components. Have a look at https://spacy.io/usage/processing-pipelines#custom-components ''' - self.nlp.add_pipe(linker, name='cat_linker', last=True) + component_name = spacy.util.get_object_name(linker) + Language.component(name=component_name, func=linker) + self.nlp.add_pipe(component_name, name='cat_linker', last=True) Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) def add_meta_cat(self, meta_cat, name): - self.nlp.add_pipe(meta_cat, name=name, last=True) + component_name = spacy.util.get_object_name(meta_cat) + Language.component(name=component_name, func=meta_cat) + self.nlp.add_pipe(component_name, name=name, last=True) # Only the meta_anns field is needed, it will be a dictionary #of {category_name: value, ...} diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index 474e2d019..c63539a12 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,30 +1,45 @@ import re -def tag_skip_and_punct(doc, config): +def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. - config (`medcat.config.Config`): - Global config for medcat. - - Return: - (`spacy.tokens.Doc): - Tagged spacy document + Args: + config (`medcat.config.Config`): + Global config for medcat. 
''' - # Make life easier - cnf_p = config.preprocessing - - for token in doc: - if config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: - # There can't be punct in a token if it also has text - token._.is_punct = True - token._.to_skip = True - elif config.word_skipper.match(token.lower_): - # Skip if specific strings - token._.to_skip = True - elif cnf_p['skip_stopwords'] and token.is_stop: - token._.to_skip = True - - return doc + + return TagSkipAndPunct(nlp, name, config) + +class TagSkipAndPunct(object): + + def __init__(self, nlp, name, config): + self.nlp = nlp + self.name = name + self.config = config + + def __call__(self, doc): + r''' Detects and tags spacy tokens that are punctuation and that should be skipped. + + Args: + doc (`spacy.tokens.Doc`): + Spacy document that will be tagged. + + Return: + (`spacy.tokens.Doc`): + Tagged spacy document + ''' + # Make life easier + cnf_p = self.config.preprocessing + + for token in doc: + if self.config.punct_checker.match(token.lower_) and token.text not in cnf_p['keep_punct']: + # There can't be punct in a token if it also has text + token._.is_punct = True + token._.to_skip = True + elif self.config.word_skipper.match(token.lower_): + # Skip if specific strings + token._.to_skip = True + elif cnf_p['skip_stopwords'] and token.is_stop: + token._.to_skip = True + + return doc diff --git a/medcat/utils/make_vocab.py b/medcat/utils/make_vocab.py index 01547d4c9..a9ebfc18f 100644 --- a/medcat/utils/make_vocab.py +++ b/medcat/utils/make_vocab.py @@ -40,7 +40,7 @@ def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None): # Build the required spacy pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff --git a/setup.py b/setup.py index 3e33f8f58..00042b257 100644 --- a/setup.py 
+++ b/setup.py @@ -21,7 +21,7 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==2.3.4', + 'spacy==3.1.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 422fde2cd..8b8bb2acd 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -1,119 +1,124 @@ -r''' The tests here are a bit messy but they work, should be converted to python unittests. -''' -from medcat.cdb_maker import CDBMaker -from medcat.config import Config -import numpy as np import logging - -config = Config() -config.general['log_level'] = logging.DEBUG -maker = CDBMaker(config) - -# Building a new CDB from two files (full_build) -csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] -cdb = maker.prepare_csvs(csvs, full_build=True) - -assert len(cdb.cui2names) == 3 -assert len(cdb.cui2snames) == 3 -assert len(cdb.name2cuis) == 5 -assert len(cdb.cui2tags) == 3 -assert len(cdb.cui2preferred_name) == 2 -assert len(cdb.cui2context_vectors) == 3 -assert len(cdb.cui2count_train) == 3 -assert cdb.name2cuis2status['virus']['C0000039'] == 'P' -assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} -assert cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} -assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") - -# Test name addition -from medcat.preprocessing.cleaners import prepare_name -cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config), name_status='P', full_build=True) -assert cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} -assert 'my:newname.' in cdb.name2cuis -assert 'my:new' in cdb.snames -assert 'my:newname.' 
in cdb.name2cuis2status -assert cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} - -# Test name removal -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -# Run again to make sure it does not break anything -cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', maker.nlp, {}, config)) -assert len(cdb.name2cuis) == 5 -assert 'my:newname.' not in cdb.name2cuis2status - -# Test filtering -cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept -cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) -assert len(cdb.cui2names) == 2 -assert len(cdb.name2cuis) == 4 -assert len(cdb.snames) == 4 - -# Test vector addition +import unittest import numpy as np -cdb.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - - -# Test negative -for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=True) - -assert cdb.cui2count_train['C0000139'] == 2 -assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 - -# Test save/load from medcat.cdb import CDB -cdb.save("./tmp_cdb.dat") -cdb2 = CDB.load('./tmp_cdb.dat') -# Check a random thing -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] - -# Test training import -cdb.reset_training() -cdb2.reset_training() -np.random.seed(11) -cuis = list(cdb.cui2names.keys()) -for i in range(2): - for cui in cuis: - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - 
cdb.update_context_vector(cui, vectors, negative=False) - -cdb2.import_training(cdb=cdb, overwrite=True) -assert cdb2.cui2context_vectors['C0000139']['long'][7] == cdb.cui2context_vectors['C0000139']['long'][7] -assert cdb2.cui2count_train['C0000139'] == cdb.cui2count_train['C0000139'] - -# Test concept similarity -cdb = CDB(config=config) -np.random.seed(11) -for i in range(500): - cui = "C" + str(i) - type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), maker.nlp, {}, config), ontologies=set(), - name_status='P', type_ids=type_ids, description='', full_build=True) - - vectors = {} - for cntx_type in config.linking['context_vector_sizes']: - vectors[cntx_type] = np.random.rand(300) - cdb.update_context_vector(cui, vectors, negative=False) -res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) -assert len(res) == 10 - -# Test training reset -cdb.reset_training() -assert len(cdb.cui2context_vectors['C0']) == 0 -assert cdb.cui2count_train['C0'] == 0 +from medcat.cdb_maker import CDBMaker +from medcat.config import Config +from medcat.preprocessing.cleaners import prepare_name + + +class CdbMakerArchiveTests(unittest.TestCase): + + def setUp(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + self.maker = CDBMaker(self.config) + + # Building a new CDB from two files (full_build) + csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + self.cdb = self.maker.prepare_csvs(csvs, full_build=True) + + def test_prepare_csvs(self): + assert len(self.cdb.cui2names) == 3 + assert len(self.cdb.cui2snames) == 3 + assert len(self.cdb.name2cuis) == 5 + assert len(self.cdb.cui2tags) == 3 + assert len(self.cdb.cui2preferred_name) == 2 + assert len(self.cdb.cui2context_vectors) == 3 + assert len(self.cdb.cui2count_train) == 3 + assert self.cdb.name2cuis2status['virus']['C0000039'] == 'P' + assert self.cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'} + 
assert self.cdb.addl_info['cui2original_names']['C0000039'] == {'Virus', 'Virus K', 'Virus M', 'Virus Z'} + assert self.cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic") + + def test_name_addition(self): + self.cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config), name_status='P', full_build=True) + assert self.cdb.addl_info['cui2original_names']['C0000239'] == {'MY: new,-_! Name.', 'Second csv'} + assert 'my:newname.' in self.cdb.name2cuis + assert 'my:new' in self.cdb.snames + assert 'my:newname.' in self.cdb.name2cuis2status + assert self.cdb.name2cuis2status['my:newname.'] == {'C0000239': 'P'} + + def test_name_removal(self): + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + # Run again to make sure it does not break anything + self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.nlp, {}, self.config)) + assert len(self.cdb.name2cuis) == 5 + assert 'my:newname.' 
not in self.cdb.name2cuis2status + + def test_filtering(self): + cuis_to_keep = {'C0000039'} # Because of transition 2 will be kept + self.cdb.filter_by_cui(cuis_to_keep=cuis_to_keep) + assert len(self.cdb.cui2names) == 2 + assert len(self.cdb.name2cuis) == 4 + assert len(self.cdb.snames) == 4 + + def test_vector_addition(self): + self.cdb.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + + def test_negative(self): + cuis = list(self.cdb.cui2names.keys()) + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=True) + + assert self.cdb.cui2count_train['C0000139'] == 2 + assert self.cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 + + def test_save_and_load(self): + self.cdb.save("./tmp_cdb.dat") + cdb2 = CDB.load('./tmp_cdb.dat') + # Check a random thing + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + + def test_training_import(self): + cdb2 = CDB.load('./tmp_cdb.dat') + self.cdb.reset_training() + cdb2.reset_training() + np.random.seed(11) + cuis = list(self.cdb.cui2names.keys()) + for i in range(2): + for cui in cuis: + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + self.cdb.update_context_vector(cui, vectors, negative=False) + + cdb2.import_training(cdb=self.cdb, overwrite=True) + assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7] + assert 
cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139'] + + def test_concept_similarity(self): + cdb = CDB(config=self.config) + np.random.seed(11) + for i in range(500): + cui = "C" + str(i) + type_ids = {'T-' + str(i%10)} + cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.nlp, {}, self.config), ontologies=set(), + name_status='P', type_ids=type_ids, description='', full_build=True) + + vectors = {} + for cntx_type in self.config.linking['context_vector_sizes']: + vectors[cntx_type] = np.random.rand(300) + cdb.update_context_vector(cui, vectors, negative=False) + res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'], min_cnt=1, topn=10, force_build=True) + assert len(res) == 10 + + def test_training_reset(self): + self.cdb.reset_training() + assert len(self.cdb.cui2context_vectors['C0']) == 0 + assert self.cdb.cui2count_train['C0'] == 0 diff --git a/tests/archive_tests/test_ner_archive.py b/tests/archive_tests/test_ner_archive.py index 1be695f35..6037f3c16 100644 --- a/tests/archive_tests/test_ner_archive.py +++ b/tests/archive_tests/test_ner_archive.py @@ -1,3 +1,9 @@ +import logging +import os +import unittest +import numpy as np +from timeit import default_timer as timer +from medcat.cdb import CDB from medcat.preprocessing.tokenizers import spacy_split_all from medcat.ner.vocab_based_ner import NER from medcat.preprocessing.taggers import tag_skip_and_punct @@ -6,127 +12,127 @@ from medcat.vocab import Vocab from medcat.preprocessing.cleaners import prepare_name from medcat.linking.vector_context_model import ContextModel -from functools import partial from medcat.linking.context_based_linker import Linker from medcat.config import Config -import logging -from medcat.cdb import CDB -import os -import requests - -config = Config() -config.general['log_level'] = logging.INFO -cdb = CDB(config=config) - -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, 
config=config), - name='skip_and_punct', - additional_fields=['is_punct']) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} - -vocab_path = "./tmp_vocab.dat" -if not os.path.exists(vocab_path): - import requests - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") - with open(vocab_path, 'wb') as f: - f.write(tmp.content) - -vocab = Vocab.load(vocab_path) -# Make the pipeline -nlp = Pipe(tokenizer=spacy_split_all, config=config) -nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config), - name='skip_and_punct', - additional_fields=['is_punct']) -spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=vocab) -nlp.add_token_normalizer(spell_checker=spell_checker, config=config) -ner = NER(cdb, config) -nlp.add_ner(ner) - -# Add Linker -link = Linker(cdb, vocab, config) -nlp.add_linker(link) - -# Test limits for tokens and uppercase -config.ner['max_skip_tokens'] = 1 -config.ner['upper_case_limit_len'] = 4 -config.linking['disamb_length_limit'] = 2 -text = "CDB - I was running and then Movar Virus attacked and CDb" -d = nlp(text) - -assert len(d._.ents) == 2 -assert d._.ents[0]._.link_candidates[0] == 'S-229004' - -# Change limit for skip -config.ner['max_skip_tokens'] = 3 -d = nlp(text) -assert len(d._.ents) == 3 - -# Change limit for upper_case -config.ner['upper_case_limit_len'] = 3 -d = nlp(text) -assert len(d._.ents) == 4 - -# Check name length limit -config.ner['min_name_len'] = 4 -d = nlp(text) -assert len(d._.ents) == 2 - -# Speed tests -from timeit import default_timer as timer -text = "CDB - I was running and then Movar Virus attacked and CDb" -text = text * 300 
-config.general['spell_check'] = True -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - -# Now without spell check -config.general['spell_check'] = False -start = timer() -for i in range(50): - d = nlp(text) -end = timer() -print("Time: ", end - start) - - -# Test for linker -import numpy as np - -config = Config() -config.general['log_level'] = logging.DEBUG -cdb = CDB(config=config) - -# Add a couple of names -cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config)) -cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config)) -cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config)) -cdb.add_names(cui='S-2290045', names=prepare_name('Movar', nlp, {}, config)) -# Check -#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} - -cuis = list(cdb.cui2names.keys()) -for cui in cuis[0:50]: - vectors = {'short': np.random.rand(300), - 'long': np.random.rand(300), - 'medium': np.random.rand(300) - } - cdb.update_context_vector(cui, vectors, negative=False) - -vocab = Vocab.load(vocab_path) -cm = ContextModel(cdb, vocab, config) -cm.train_using_negative_sampling('S-229004') -config.linking['train_count_threshold'] = 0 - -cm.train('S-229004', d._.ents[1], d) -cm.similarity('S-229004', d._.ents[1], d) -cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) +class NerArchiveTests(unittest.TestCase): + + def setUp(self) -> None: + self.config = Config() + self.config.general['log_level'] = logging.INFO + cdb = CDB(config=self.config) + + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', 
self.nlp, {}, self.config)) + cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}} + + self.vocab_path = "./tmp_vocab.dat" + if not os.path.exists(self.vocab_path): + import requests + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") + with open(self.vocab_path, 'wb') as f: + f.write(tmp.content) + + vocab = Vocab.load(self.vocab_path) + # Make the pipeline + self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) + self.nlp.add_tagger(tagger=tag_skip_and_punct, + name='skip_and_punct', + additional_fields=['is_punct']) + spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab) + self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config) + ner = NER(cdb, self.config) + self.nlp.add_ner(ner) + + # Add Linker + link = Linker(cdb, vocab, self.config) + self.nlp.add_linker(link) + + self.text = "CDB - I was running and then Movar Virus attacked and CDb" + + def test_limits_for_tokens_and_uppercase(self): + self.config.ner['max_skip_tokens'] = 1 + self.config.ner['upper_case_limit_len'] = 4 + self.config.linking['disamb_length_limit'] = 2 + + d = self.nlp(self.text) + + assert len(d._.ents) == 2 + assert d._.ents[0]._.link_candidates[0] == 'S-229004' + + def test_change_limit_for_skip(self): + self.config.ner['max_skip_tokens'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 3 + + def test_change_limit_for_upper_case(self): + self.config.ner['upper_case_limit_len'] = 3 + d = self.nlp(self.text) + assert len(d._.ents) == 4 + + def test_check_name_length_limit(self): + self.config.ner['min_name_len'] = 4 + d = self.nlp(self.text) + assert len(d._.ents) == 2 + + def test_speed(self): + text = "CDB - I was running and then Movar Virus attacked and CDb" + text = text * 300 + self.config.general['spell_check'] = True + start = timer() + for i 
in range(50): + d = self.nlp(text) + end = timer() + print("Time: ", end - start) + + def test_without_spell_check(self): + # Now without spell check + self.config.general['spell_check'] = False + start = timer() + for i in range(50): + d = self.nlp(self.text) + end = timer() + print("Time: ", end - start) + + + def test_for_linker(self): + self.config = Config() + self.config.general['log_level'] = logging.DEBUG + cdb = CDB(config=self.config) + + # Add a couple of names + cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config)) + cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config)) + cdb.add_names(cui='S-2290045', names=prepare_name('Movar', self.nlp, {}, self.config)) + # Check + #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}} + + cuis = list(cdb.cui2names.keys()) + for cui in cuis[0:50]: + vectors = {'short': np.random.rand(300), + 'long': np.random.rand(300), + 'medium': np.random.rand(300) + } + cdb.update_context_vector(cui, vectors, negative=False) + + d = self.nlp(self.text) + vocab = Vocab.load(self.vocab_path) + cm = ContextModel(cdb, vocab, self.config) + cm.train_using_negative_sampling('S-229004') + self.config.linking['train_count_threshold'] = 0 + + cm.train('S-229004', d._.ents[1], d) + + cm.similarity('S-229004', d._.ents[1], d) + + cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d) diff --git a/tests/test_cdb_maker.py b/tests/test_cdb_maker.py index 3917c075b..9d84215a5 100644 --- a/tests/test_cdb_maker.py +++ b/tests/test_cdb_maker.py @@ -1,10 +1,11 @@ import unittest +import logging +import os +import numpy as np from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB from medcat.config import Config from medcat.preprocessing.cleaners import prepare_name -import numpy as np -import logging 
#cdb.csv #cui name ontologies name_status type_ids description @@ -20,6 +21,7 @@ #TESTS RUN IN ALPHABETICAL ORDER - CONTROLLING WITH '[class_letter]Class and test_[classletter subclassletter]' function syntax + class A_CDBMakerLoadTests(unittest.TestCase): @classmethod @@ -28,7 +30,10 @@ def setUpClass(cls): config = Config() config.general['log_level'] = logging.DEBUG maker = CDBMaker(config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = maker.prepare_csvs(csvs, full_build=True) def test_aa_cdb_names_length(self): @@ -110,7 +115,10 @@ def setUpClass(cls): cls.config = Config() cls.config.general['log_level'] = logging.DEBUG cls.maker = CDBMaker(cls.config) - csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv'] + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'examples', 'cdb_2.csv') + ] cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) cls.cdb2 = CDB(cls.config) diff --git a/tests/test_ner.py b/tests/test_ner.py index e05be9659..14720c205 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -28,7 +28,7 @@ def setUpClass(cls): print("Set up Vocab") vocab_path = "./tmp_vocab.dat" if not os.path.exists(vocab_path): - tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat") + tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat") with open(vocab_path, 'wb') as f: f.write(tmp.content) @@ -37,7 +37,7 @@ def setUpClass(cls): print("Set up NLP pipeline") cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config) - cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=cls.config), + cls.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) diff 
--git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccf9dfa81..ccd5e591c 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -19,7 +19,7 @@ RUN pip install -r requirements.txt # Get the spacy and scipspacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From 365faff1fa3a3046c0044b113d6f3058ec99af8a Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:12:33 +0100 Subject: [PATCH 2/7] CAT-31 add github actions and manage scispaCy models using requirements-*.txt --- .github/workflows/main.yml | 30 ++++++++++++++++++++++++++++++ medcat/utils/normalizers.py | 5 +---- requirements-lg.txt | 2 ++ requirements-sm.txt | 2 ++ requirements.txt | 1 + setup.py | 4 ++-- 6 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/main.yml create mode 100644 requirements-lg.txt create mode 100644 requirements-sm.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..b5eaceb3c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,30 @@ +name: Main + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ 3.7 ] + max-parallel: 1 + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Test + run: | + python -m unittest discover \ No newline at end of file diff --git a/medcat/utils/normalizers.py b/medcat/utils/normalizers.py index 
cc15bb0f6..8e6a07e27 100644 --- a/medcat/utils/normalizers.py +++ b/medcat/utils/normalizers.py @@ -1,9 +1,6 @@ -#import hunspell import re -from collections import Counter -from spacy.tokens import Span import spacy -import os + CONTAINS_NUMBER = re.compile('[0-9]+') diff --git a/requirements-lg.txt b/requirements-lg.txt new file mode 100644 index 000000000..7d293df4e --- /dev/null +++ b/requirements-lg.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements-sm.txt b/requirements-sm.txt new file mode 100644 index 000000000..03885cbf2 --- /dev/null +++ b/requirements-sm.txt @@ -0,0 +1,2 @@ +. +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9c558e357..78c7cf284 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ . +https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz \ No newline at end of file diff --git a/setup.py b/setup.py index 0012bfa07..f65830479 100644 --- a/setup.py +++ b/setup.py @@ -21,11 +21,11 @@ 'numpy~=1.20', 'pandas~=1.0', 'gensim~=3.8', - 'spacy==3.1.1', + 'spacy<3.1.0,>=3.0.1', 'scipy~=1.5', 'transformers~=4.5.1', 'torch~=1.8.1', - 'Flask~=1.1', + 'tqdm<4.50.0,>=4.27', 'sklearn~=0.0', 'elasticsearch~=7.10', 'dill~=0.3.3', From b49a4523e0c300fcd6bff84c8975d0f6dce44201 Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 3/7] CAT-31 test github actions --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..865cfd186 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: Main on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: 
branches: [ master ] From 84a2595efe27b32d2bf845cddd26a235f8780705 Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 14:14:36 +0100 Subject: [PATCH 4/7] CAT-31 add badges to readme and fix vocab urls --- .github/workflows/main.yml | 6 +++--- README.md | 6 +++++- webapp/envs/env_medmen | 4 ++-- webapp/webapp/Dockerfile | 7 +++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b5eaceb3c..df7f5d505 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,8 +1,8 @@ -name: Main +name: build on: push: - branches: [ master ] + branches: [ master, spacy-v3 ] pull_request: branches: [ master ] @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.7 ] + python-version: [ 3.7, 3.8, 3.9 ] max-parallel: 1 steps: diff --git a/README.md b/README.md index f1eea4679..2437541b8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # Medical oncept Annotation Tool +[![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster) +[![Latest release](https://img.shields.io/github/v/release/CogStack/MedCAT)](https://github.com/CogStack/MedCAT/releases/latest) +[![pypi Version](https://img.shields.io/pypi/v/medcat.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/medcat/) + MedCAT can be used to extract information from Electronic Health Records (EHRs) and link it to biomedical ontologies like SNOMED-CT and UMLS. Paper on [arXiv](https://arxiv.org/abs/2010.01165). -## News +## News - **New Feature and Tutorial \[8. July 2021\]**: [Integrating 🤗 Transformers with MedCAT for biomedical NER+L](https://towardsdatascience.com/integrating-transformers-with-medcat-for-biomedical-ner-l-8869c76762a) - **General \[1. 
April 2021\]**: MedCAT is upgraded to v1, unforunately this introduces breaking changes with older models (MedCAT v0.4), as well as potential problems with all code that used the MedCAT package. MedCAT v0.4 is available on the legacy diff --git a/webapp/envs/env_medmen b/webapp/envs/env_medmen index 85b79f850..447f0dfb9 100644 --- a/webapp/envs/env_medmen +++ b/webapp/envs/env_medmen @@ -19,8 +19,8 @@ KEEP_PUNCT=:|. SPACY_MODEL=en_core_sci_md VOCAB_PATH=/webapp/models/vocab.dat CDB_PATH=/webapp/models/cdb.dat -VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat MKL_NUM_THREAD=1 NUMEXPR_NUM_THREADS=1 diff --git a/webapp/webapp/Dockerfile b/webapp/webapp/Dockerfile index ccd5e591c..61575edba 100644 --- a/webapp/webapp/Dockerfile +++ b/webapp/webapp/Dockerfile @@ -6,8 +6,8 @@ RUN mkdir -p /webapp/models # Copy everything COPY . 
/webapp -ENV VOCAB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -ENV CDB_URL=https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat +ENV VOCAB_URL=https://medcat.rosalind.kcl.ac.uk/media/vocab.dat +ENV CDB_URL=https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1.dat ENV CDB_PATH=/webapp/models/cdb.dat ENV VOCAB_PATH=/webapp/models/vocab.dat @@ -17,9 +17,8 @@ WORKDIR /webapp RUN pip install -r requirements.txt -# Get the spacy and scipspacy model +# Get the spacy model RUN python -m spacy download en_core_web_md -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz # Build the db RUN python manage.py makemigrations && \ From bfeeba9c801d5224e345fa0de10865bf3c2945bb Mon Sep 17 00:00:00 2001 From: baixiac Date: Tue, 27 Jul 2021 17:54:15 +0100 Subject: [PATCH 5/7] CAT-18 replace another partial function --- .github/workflows/main.yml | 2 +- medcat/cat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df7f5d505..849333ac5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: python-version: [ 3.7, 3.8, 3.9 ] - max-parallel: 1 + max-parallel: 3 steps: - uses: actions/checkout@v2 diff --git a/medcat/cat.py b/medcat/cat.py index 9a7db9f15..f5c86dbed 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -70,7 +70,7 @@ def __init__(self, cdb, config, vocab, meta_cats=[]): # Build the pipeline self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config) - self.nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=self.config), + self.nlp.add_tagger(tagger=tag_skip_and_punct, name='skip_and_punct', additional_fields=['is_punct']) From 6c07e78d7f5266682cffe1f123a199604df8ca60 Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:06 +0100 Subject: [PATCH 6/7] CAT-18 close files after their use and pass on the replace param --- medcat/vocab.py | 27 
+++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/medcat/vocab.py b/medcat/vocab.py index d2558744a..f21de2c8a 100644 --- a/medcat/vocab.py +++ b/medcat/vocab.py @@ -134,7 +134,7 @@ def add_word(self, word, cnt=1, vec=None, replace=True): cnt (int): count of this word in your dataset vec (np.array): - the vector repesentation of the word + the vector representation of the word replace (bool): will replace old vector representation """ @@ -170,17 +170,16 @@ def add_words(self, path, replace=True): replace (bool): existing words in the vocabulary will be replaced """ - f = open(path) + with open(path) as f: + for line in f: + parts = line.split("\t") + word = parts[0] + cnt = int(parts[1].strip()) + vec = None + if len(parts) == 3: + vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - for line in f: - parts = line.split("\t") - word = parts[0] - cnt = int(parts[1].strip()) - vec = None - if len(parts) == 3: - vec = np.array([float(x) for x in parts[2].strip().split(" ")]) - - self.add_word(word, cnt, vec) + self.add_word(word, cnt, vec, replace) def make_unigram_table(self, table_size=100000000): @@ -232,13 +231,17 @@ def get_negative_samples(self, n=6, ignore_punct_and_num=False): def __getitem__(self, word): - return self.vocab[word]['cnt'] + return self.count(word) def vec(self, word): return self.vocab[word]['vec'] + def count(self, word): + return self.vocab[word]['cnt'] + + def item(self, word): return self.vocab[word] From a70409d0dd57b49f8a5c8910224ca3f386550eb3 Mon Sep 17 00:00:00 2001 From: baixiac Date: Wed, 28 Jul 2021 11:32:58 +0100 Subject: [PATCH 7/7] CAT-18 refactor and add more tests --- examples/vocab_data.txt | 2 ++ medcat/cat.py | 2 +- medcat/pipe.py | 12 ++------ medcat/preprocessing/taggers.py | 21 +++++--------- tests/test_cat.py | 27 +++++++++++++++++ tests/test_cdb.py | 51 +++++++++++++++++++++++++++++++++ tests/test_vocab.py | 39 +++++++++++++++++++++++++ 7 files changed, 130 
insertions(+), 24 deletions(-) create mode 100644 examples/vocab_data.txt create mode 100644 tests/test_cat.py create mode 100644 tests/test_cdb.py create mode 100644 tests/test_vocab.py diff --git a/examples/vocab_data.txt b/examples/vocab_data.txt new file mode 100644 index 000000000..da3f43048 --- /dev/null +++ b/examples/vocab_data.txt @@ -0,0 +1,2 @@ +house 34444 0.3232 0.123213 1.231231 +dog 14444 0.76762 0.76767 1.45454 diff --git a/medcat/cat.py b/medcat/cat.py index f5c86dbed..ce8a24628 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -116,7 +116,7 @@ def __call__(self, text, do_train=False): Returns: A spacy document with the extracted entities ''' - # Should we train - do not use this for training, unles you know what you are doing. Use the + # Should we train - do not use this for training, unless you know what you are doing. Use the #self.train() function self.config.linking['train'] = do_train diff --git a/medcat/pipe.py b/medcat/pipe.py index d22094bb5..7fd7c5a10 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -24,7 +24,6 @@ def __init__(self, tokenizer, config): self.nlp.tokenizer = tokenizer(self.nlp) self.config = config - def add_tagger(self, tagger, name, additional_fields=[]): r''' Add any kind of a tagger for tokens. @@ -37,9 +36,9 @@ def add_tagger(self, tagger, name, additional_fields=[]): additional_fields (`List[str]`): Fields to be added to the `_` properties of a token. 
''' - component_name = spacy.util.get_object_name(tagger) - Language.factory(name=component_name, default_config={"config": self.config}, func=tagger) - self.nlp.add_pipe(component_name, name='tag_' + name, first=True) + component_factory_name = spacy.util.get_object_name(tagger) + Language.factory(name=component_factory_name, default_config={"config": self.config}, func=tagger) + self.nlp.add_pipe(component_factory_name, name='tag_' + name, first=True) # Add custom fields needed for this usecase Token.set_extension('to_skip', default=False, force=True) @@ -47,7 +46,6 @@ def add_tagger(self, tagger, name, additional_fields=[]): for field in additional_fields: Token.set_extension(field, default=False, force=True) - def add_token_normalizer(self, config, spell_checker=None): token_normalizer = TokenNormalizer(spell_checker=spell_checker, config=config) component_name = spacy.util.get_object_name(token_normalizer) @@ -57,7 +55,6 @@ def add_token_normalizer(self, config, spell_checker=None): # Add custom fields needed for this usecase Token.set_extension('norm', default=None, force=True) - def add_ner(self, ner): r''' Add NER from CAT to the pipeline, will also add the necessary fields to the document and Span objects. @@ -75,7 +72,6 @@ def add_ner(self, ner): Span.set_extension('detected_name', default=None, force=True) Span.set_extension('link_candidates', default=None, force=True) - def add_linker(self, linker): r''' Add entity linker to the pipeline, will also add the necessary fields to Span object. 
@@ -90,7 +86,6 @@ def add_linker(self, linker): Span.set_extension('cui', default=-1, force=True) Span.set_extension('context_similarity', default=-1, force=True) - def add_meta_cat(self, meta_cat, name): component_name = spacy.util.get_object_name(meta_cat) Language.component(name=component_name, func=meta_cat) @@ -100,6 +95,5 @@ def add_meta_cat(self, meta_cat, name): #of {category_name: value, ...} Span.set_extension('meta_anns', default=None, force=True) - def __call__(self, text): return self.nlp(text) diff --git a/medcat/preprocessing/taggers.py b/medcat/preprocessing/taggers.py index c63539a12..6ec2ac6e0 100644 --- a/medcat/preprocessing/taggers.py +++ b/medcat/preprocessing/taggers.py @@ -1,16 +1,19 @@ -import re - def tag_skip_and_punct(nlp, name, config): r''' Detects and tags spacy tokens that are punctuation and that should be skipped. Args: + nlp (spacy.language.): + The base spacy NLP pipeline. + name (`str`): + The component instance name. config (`medcat.config.Config`): Global config for medcat. ''' - return TagSkipAndPunct(nlp, name, config) + return _Tagger(nlp, name, config) + -class TagSkipAndPunct(object): +class _Tagger(object): def __init__(self, nlp, name, config): self.nlp = nlp @@ -18,16 +21,6 @@ def __init__(self, nlp, name, config): self.config = config def __call__(self, doc): - r''' Detects and tags spacy tokens that are punctuation and that should be skipped. - - Args: - doc (`spacy.tokens.Doc`): - Spacy document that will be tagged. 
- - Return: - (`spacy.tokens.Doc): - Tagged spacy document - ''' # Make life easier cnf_p = self.config.preprocessing diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 000000000..869bcba54 --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,27 @@ +import os +import unittest +from medcat.vocab import Vocab +from medcat.cdb import CDB +from medcat.cat import CAT + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) + self.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + self.cdb.config.ner['min_name_len'] = 2 + self.cdb.config.ner['upper_case_limit_len'] = 3 + self.cdb.config.general['spell_check'] = True + self.cdb.config.linking['train_count_threshold'] = 10 + self.cdb.config.linking['similarity_threshold'] = 0.3 + self.cdb.config.linking['train'] = True + self.cdb.config.linking['disamb_length_limit'] = 5 + self.cdb.config.general['full_unlink'] = True + self.undertest = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab) + + def test_pipeline(self): + text = "The dog is sitting outside the house." 
+ doc = self.undertest(text) + self.assertEqual(text, doc.text) + diff --git a/tests/test_cdb.py b/tests/test_cdb.py new file mode 100644 index 000000000..0d9bad237 --- /dev/null +++ b/tests/test_cdb.py @@ -0,0 +1,51 @@ +import os +import shutil +import unittest +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class CDBTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_sci_md" + cls.cdb_maker = CDBMaker(config) + + def setUp(self) -> None: + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.csv") + cdb_2_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb_2.csv") + self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + self.undertest = CDBTests.cdb_maker.prepare_csvs([cdb_csv, cdb_2_csv], full_build=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_name2cuis(self): + self.assertEqual({ + 'second~csv': ['C0000239'], + 'virus': ['C0000039', 'C0000139'], + 'virus~k': ['C0000039', 'C0000139'], + 'virus~m': ['C0000039', 'C0000139'], + 'virus~z': ['C0000039', 'C0000139'] + }, self.undertest.name2cuis) + + def test_cui2names(self): + self.assertEqual({ + 'C0000039': {'virus~z', 'virus~k', 'virus~m', 'virus'}, + 'C0000139': {'virus~z', 'virus', 'virus~m', 'virus~k'}, + 'C0000239': {'second~csv'} + }, self.undertest.cui2names) + + def test_cui2preferred_name(self): + self.assertEqual({'C0000039': 'Virus', 'C0000139': 'Virus Z'}, self.undertest.cui2preferred_name) + + def test_cui2type_ids(self): + self.assertEqual({'C0000039': {'T109', 'T234', 'T123'}, 'C0000139': set(), 'C0000239': set()}, self.undertest.cui2type_ids) + + def test_save_and_load(self): + cdb_path = f"{self.tmp_dir}/cdb.dat" + self.undertest.save(cdb_path) + self.undertest.load(cdb_path) diff --git a/tests/test_vocab.py 
b/tests/test_vocab.py new file mode 100644 index 000000000..8db82df41 --- /dev/null +++ b/tests/test_vocab.py @@ -0,0 +1,39 @@ +import os +import shutil +import unittest +from medcat.vocab import Vocab + + +class CATTests(unittest.TestCase): + + def setUp(self) -> None: + self.undertest = Vocab() + self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") + os.makedirs(self.tmp_dir, exist_ok=True) + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir) + + def test_add_words(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys())) + + def test_add_word(self): + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + self.assertEqual(["test"], list(self.undertest.vocab.keys())) + self.assertTrue("test" in self.undertest) + + def test_count(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.assertEqual(34444, self.undertest.count("house")) + + def test_save_and_load(self): + self.undertest.add_words(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab_data.txt")) + self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55]) + vocab_path = f"{self.tmp_dir}/vocab.dat" + self.undertest.save(vocab_path) + vocab = Vocab.load(vocab_path) + self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys())) + + +