From a1dccf4d21f7b23996250cb7fde61531395dfec0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:48:04 +0100 Subject: [PATCH 01/64] Bump aiohttp from 3.8.3 to 3.8.5 (#333) Bumps [aiohttp](https://github.com/aio-libs/aiohttp) from 3.8.3 to 3.8.5. - [Release notes](https://github.com/aio-libs/aiohttp/releases) - [Changelog](https://github.com/aio-libs/aiohttp/blob/v3.8.5/CHANGES.rst) - [Commits](https://github.com/aio-libs/aiohttp/compare/v3.8.3...v3.8.5) --- updated-dependencies: - dependency-name: aiohttp dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 84ca5cc2a..be517876f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,4 @@ sphinx-rtd-theme~=1.0 myst-parser~=0.17 sphinx-autoapi~=1.8 setuptools>=60.0 -aiohttp==3.8.3 \ No newline at end of file +aiohttp==3.8.5 \ No newline at end of file diff --git a/setup.py b/setup.py index 4e73b2f89..cefc88572 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ 'pydantic>=1.10.0,<2.0', # for spacy compatibility; avoid 2.0 due to breaking changes # the following are not direct dependencies of MedCAT but needed for docs/building # hopefully will no longer need the transitive dependencies - 'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec <- datasets <- medcat + 'aiohttp==3.8.5', # 3.8.3 is needed for compatibility with fsspec <- datasets <- medcat 'blis<0.8.0,>=0.7.8', # as required by thinc <- spacy <- medcat # 'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy # 'joblib~=1.2', From 8fe9dfcdf8541149545faa683890bf234d2608c0 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Mon, 31 Jul 2023 12:08:23 +0100 Subject: [PATCH 02/64] CU-862k77jjj: changes needed for Trainer metrics page --- medcat/cat.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index b2d3f7cb3..5218e9d02 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -534,10 +534,14 @@ def _print_stats(self, anns_norm.append((ann['start'], cui)) anns_examples.append({"text": doc['text'][max(0, ann['start']-60):ann['end']+60], "cui": cui, + "start": ann['start'], + "end": ann['end'], "source value": ann['value'], "acc": 1, "project name": project.get('name'), - "document name": doc.get('name')}) + "document name": doc.get('name'), + "project id": project.get('id'), + "document id": doc.get('id')}) elif ann.get('validated', True) and (ann.get('killed', False) or ann.get('deleted', False)): anns_norm_neg.append((ann['start'], cui)) @@ -556,11 +560,14 @@ def _print_stats(self, p_anns_norm.append((ann.start_char, cui)) p_anns_examples.append({"text": doc['text'][max(0, ann.start_char-60):ann.end_char+60], "cui": cui, + "start": ann.start_char, + "end": ann.end_char, "source value": ann.text, "acc": float(ann._.context_similarity), "project name": project.get('name'), - "document name": doc.get('name')}) - + "document name": doc.get('name'), + "project id": project.get('id'), + "document id": doc.get('id')}) for iann, ann in enumerate(p_anns_norm): cui = ann[1] if ann in anns_norm: From 9f9b25b21e700fde49c86372569ec5ca008abad0 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Mon, 14 Aug 2023 12:12:33 +0100 Subject: [PATCH 03/64] remove bad merge
 <p> element

---
 webapp/webapp/demo/templates/train_annotations.html | 2 --
 1 file changed, 2 deletions(-)

diff --git a/webapp/webapp/demo/templates/train_annotations.html b/webapp/webapp/demo/templates/train_annotations.html
index 19b5882c7..25677cd21 100644
--- a/webapp/webapp/demo/templates/train_annotations.html
+++ b/webapp/webapp/demo/templates/train_annotations.html
@@ -29,8 +29,6 @@
   Disclaimer
   WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED.
   contact@cogstack.com for more information.
-
   contact@cogstack.com for more information.
-
   Sample text
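A minimal sketch (not part of any patch in this series; all values are hypothetical) of the per-annotation example record that CAT._print_stats() emits after PATCH 02 above, and one way such records could be tabulated for the Trainer metrics page:

    import pandas as pd  # pandas is already a MedCAT dependency

    # Field names are taken from the PATCH 02 diff; the values are made up.
    example = {
        "text": "... roughly 60 characters of context around the annotation ...",
        "cui": "C0011849",
        "start": 120,
        "end": 128,
        "source value": "diabetes",
        "acc": 1,
        "project name": "Example project",
        "document name": "Example document",
        "project id": 1,    # added by PATCH 02
        "document id": 42,  # added by PATCH 02
    }

    # Collect many such records and render them as a metrics table.
    df = pd.DataFrame([example])
    print(df[["cui", "source value", "start", "end", "project id", "document id"]])
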
From eec6b5a68dd0cb9d9b91015254525610936237e7 Mon Sep 17 00:00:00 2001
From: Mart Ratas 
Date: Mon, 4 Sep 2023 13:05:54 +0300
Subject: [PATCH 04/64] CU-8692kpchc Fix for Rosalind link not working (#342)

* CU-8692kpchc Add the 403 exception to vocab downloader

* CU-8692kpchc Add the new vocab download link
---
 tests/helper.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/helper.py b/tests/helper.py
index 23afdb6b4..9fb66589b 100644
--- a/tests/helper.py
+++ b/tests/helper.py
@@ -24,6 +24,15 @@ async def __call__(self, *args, **kwargs):
 
 """
 
+ERROR_403 = b"""<html><head>
+<title>403 Forbidden</title>
+</head><body>
+<h1>Forbidden</h1>
+<p>You don't have permission to access this resource.</p>
+ +""" + SIMPLE_WORDS = """house 34444 0.3232 0.123213 1.231231 dog 14444 0.76762 0.76767 1.45454""" @@ -45,7 +54,7 @@ def generate_simple_vocab(): class VocabDownloader: - url = 'https://medcat.rosalind.kcl.ac.uk/media/vocab.dat' + url = 'https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/vocab.dat' vocab_path = "./tmp_vocab.dat" _has_simple = False @@ -54,6 +63,8 @@ def is_valid(self): content = f.read() if content == ERROR_503: return False + if content == ERROR_403: + return False v = Vocab.load(self.vocab_path) if len(v.vocab) == 2: # simple one self._has_simple = True @@ -64,7 +75,7 @@ def check_or_download(self): if os.path.exists(self.vocab_path) and self.is_valid(): return tmp = requests.get(self.url) - if tmp.content == ERROR_503: + if tmp.content == ERROR_503 or tmp.content == ERROR_403: print('Rosalind server unavailable') if self._has_simple: print('Local simple vocab already present') From 54d8a6d7f08c7761f1a9a366c05ea6c0f52fcec9 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 4 Sep 2023 14:53:32 +0300 Subject: [PATCH 05/64] Add missing self argument (#343) To `_refset_df2dict ` method in Snomed preprocessing --- medcat/utils/preprocess_snomed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 5e65b3a77..3ba94b977 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -327,7 +327,7 @@ def _check_path_and_release(self): raise FileNotFoundError('Incorrect path to SNOMED CT directory') return paths, snomed_releases - def _refset_df2dict(refset_df: pd.DataFrame) -> dict: + def _refset_df2dict(self, refset_df: pd.DataFrame) -> dict: """ This function takes a SNOMED refset DataFrame as an input and converts it into a dictionary. The DataFrame should contain the columns 'referencedComponentId','mapTarget','mapGroup','mapPriority','mapRule','mapAdvice'. 
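To make the failure mode that PATCH 05 removes concrete: a hypothetical stand-in class (not the actual SNOMED preprocessing class) showing that an instance method defined without `self` receives the instance itself as its only positional argument, so passing the refset DataFrame fails:

    import pandas as pd

    class RefsetMapper:  # hypothetical stand-in, for illustration only
        def _refset_df2dict(refset_df):  # bug: missing `self`, as before PATCH 05
            return {row.referencedComponentId: row.mapTarget
                    for row in refset_df.itertuples()}

    refset_df = pd.DataFrame({"referencedComponentId": ["123456789"],
                              "mapTarget": ["A00.0"]})
    try:
        RefsetMapper()._refset_df2dict(refset_df)
    except TypeError as err:
        # TypeError: _refset_df2dict() takes 1 positional argument but 2 were given
        print(err)
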
From 3aaef441e7d121758fc2c390ce34b6ce876cf855 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 4 Sep 2023 15:15:18 +0300 Subject: [PATCH 06/64] CU-8692kn0yv Fix issue with fake dict in identifier based config More specifically the get method which was not able to return default values for non-existant keys (#341) --- medcat/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/medcat/config.py b/medcat/config.py index 07cf6f7f1..b2e324deb 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -28,7 +28,11 @@ class FakeDict: """FakeDict that allows the use of the __getitem__ and __setitem__ method for legacy access.""" def __getitem__(self, arg: str) -> Any: - return getattr(self, arg) + try: + return getattr(self, arg) + except AttributeError as e: + raise KeyError from e + def __setitem__(self, arg: str, val) -> None: setattr(self, arg, val) From e0c64561eb7c839f42e5a32188789981d6c3e6d3 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 21 Sep 2023 16:21:00 +0300 Subject: [PATCH 07/64] CU-8692mevx8 Fix issue with filters not taking effect in train_supervised method (#345) * CU-8692mevx8 Fix issue with filters not taking effect in train_supervised method * CU-8692mevx8 Fix filter retention in train_supervised method --- medcat/cat.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 5218e9d02..2323cd737 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -490,7 +490,8 @@ def _print_stats(self, fp_docs: Set = set() fn_docs: Set = set() - local_filters = self.config.linking.filters.copy_of() + orig_filters = self.config.linking.filters.copy_of() + local_filters = self.config.linking.filters for pind, project in tqdm(enumerate(data['projects']), desc="Stats project", total=len(data['projects']), leave=False): local_filters.cuis = set() @@ -645,6 +646,8 @@ def _print_stats(self, except Exception: traceback.print_exc() + self.config.linking.filters = orig_filters + return fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples def _set_project_filters(self, local_filters: LinkingFilters, project: dict, @@ -1033,7 +1036,13 @@ def train_supervised_raw(self, """ checkpoint = self._init_ckpts(is_resumed, checkpoint) - local_filters = self.config.linking.filters.copy_of() + # the config.linking.filters stuff is used directly in + # medcat.linking.context_based_linker and medcat.linking.vector_context_model + # as such, they need to be kept up to date with per-project filters + # However, the original state needs to be kept track of + # so that it can be restored after training + orig_filters = self.config.linking.filters.copy_of() + local_filters = self.config.linking.filters fp = fn = tp = p = r = f1 = examples = {} @@ -1094,7 +1103,7 @@ def train_supervised_raw(self, if retain_filters and extra_cui_filter and not retain_extra_cui_filter: # adding project filters without extra_cui_filters self._set_project_filters(local_filters, project, set(), use_filters) - self.config.linking.filters.merge_with(local_filters) + orig_filters.merge_with(local_filters) # adding extra_cui_filters, but NOT project filters self._set_project_filters(local_filters, project, extra_cui_filter, False) # refrain from doing it again for subsequent epochs @@ -1140,7 +1149,7 @@ def train_supervised_raw(self, checkpoint.save(self.cdb, latest_trained_step) # if retaining MCT filters AND (if they exist) extra_cui_filters if retain_filters: - self.config.linking.filters.merge_with(local_filters) + 
orig_filters.merge_with(local_filters) # refrain from doing it again for subsequent epochs retain_filters = False @@ -1162,6 +1171,9 @@ def train_supervised_raw(self, use_groups=use_groups, extra_cui_filter=extra_cui_filter) + # reset the state of filters + self.config.linking.filters = orig_filters + return fp, fn, tp, p, r, f1, cui_counts, examples def get_entities(self, From dd895a9fd441bbd3356d4f838ae2cf9d2150a581 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Oct 2023 04:38:04 +0000 Subject: [PATCH 08/64] Bump urllib3 from 1.26.5 to 1.26.17 in /webapp/webapp Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.5 to 1.26.17. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.5...1.26.17) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index a4b7827ad..d6a590572 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -3,4 +3,4 @@ django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 medcat==1.2.7 -urllib3==1.26.5 +urllib3==1.26.17 From 4daceb2bbb28106c40a6e99a78060ca5b79ffffc Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Mon, 9 Oct 2023 17:01:50 +0100 Subject: [PATCH 09/64] CU-8692wb8gf: 'tokenizers>=0.12.0', # 0.13.1 doesn't seem to build --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index cefc88572..5c82805ba 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ 'spacy>=3.1.0', 'scipy~=1.9.2', # first to support 3.11 'transformers>=4.19.2,<4.22.0', # upper bound is needed for the de-id model until it is retrained + 'tokenizers>=0.12.0', # 0.13.1 doesn't seem to build 'torch>=1.13.0', # first to support 3.11 'tqdm>=4.27', 'scikit-learn>=1.1.3', # first to supporrt 3.11 From 7f798c2588e085afb9a5c1c1382a003776455894 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Mon, 9 Oct 2023 17:07:31 +0100 Subject: [PATCH 10/64] CU-8692wb8gf: pin to pre 0.12, so rust compiler install reliably works, as we can't use 0.14 due to de-id transformers dep reliance --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5c82805ba..db91b6a0a 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'spacy>=3.1.0', 'scipy~=1.9.2', # first to support 3.11 'transformers>=4.19.2,<4.22.0', # upper bound is needed for the de-id model until it is retrained - 'tokenizers>=0.12.0', # 0.13.1 doesn't seem to build + 'tokenizers~=0.12', # 0.13.1 doesn't seem to build 'torch>=1.13.0', # first to support 3.11 'tqdm>=4.27', 'scikit-learn>=1.1.3', # first to supporrt 3.11 From 158ef5893670d9429e39a517c89f3c49eee2f64e Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Tue, 10 Oct 2023 00:10:34 +0100 Subject: [PATCH 11/64] CU-8692wcmp7: update transformers to the latest version --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index db91b6a0a..e9757e849 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,7 @@ 'gensim>=4.3.0', # first to support 3.11 'spacy>=3.1.0', 'scipy~=1.9.2', # first to support 3.11 - 'transformers>=4.19.2,<4.22.0', # upper bound is needed for the de-id model until it is 
retrained - 'tokenizers~=0.12', # 0.13.1 doesn't seem to build + 'transformers>=4.34', 'torch>=1.13.0', # first to support 3.11 'tqdm>=4.27', 'scikit-learn>=1.1.3', # first to supporrt 3.11 From a5cdb8a5c3a6e9dce659f2c063e7739cae4f98b8 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Tue, 10 Oct 2023 10:18:15 +0100 Subject: [PATCH 12/64] CU-8692wcmp7: include accelerate as required by the de-id test. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e9757e849..25a7a7c43 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,8 @@ 'gensim>=4.3.0', # first to support 3.11 'spacy>=3.1.0', 'scipy~=1.9.2', # first to support 3.11 - 'transformers>=4.34', + 'transformers>=4.34.0', + 'accelerate>=0.23.0', # required by Trainer class in de-id 'torch>=1.13.0', # first to support 3.11 'tqdm>=4.27', 'scikit-learn>=1.1.3', # first to supporrt 3.11 From ab07daabec2c32b4c0d2ddd8f0d53222402f700b Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 10 Oct 2023 14:04:06 +0300 Subject: [PATCH 13/64] CU-8692wgmkm: Remove py2neo dependency and the code that used it (#356) * CU-8692wgmkm: Remove py2neo dependency and the code that used it * CU-8692wgmkm: Remove medcat.neo package from setup.py --- medcat/neo/__init__.py | 0 medcat/neo/data_preparation.py | 231 --------------------------------- medcat/neo/neo_connector.py | 161 ----------------------- setup.py | 3 +- 4 files changed, 1 insertion(+), 394 deletions(-) delete mode 100644 medcat/neo/__init__.py delete mode 100644 medcat/neo/data_preparation.py delete mode 100644 medcat/neo/neo_connector.py diff --git a/medcat/neo/__init__.py b/medcat/neo/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/medcat/neo/data_preparation.py b/medcat/neo/data_preparation.py deleted file mode 100644 index 551c3117e..000000000 --- a/medcat/neo/data_preparation.py +++ /dev/null @@ -1,231 +0,0 @@ -import os -import pandas as pd - - -def get_index_queries(): - """Run before everything to speed up things.""" - return ['CREATE INDEX patientId FOR (p:Patient) ON (p.patientId);', - 'CREATE INDEX conceptId FOR (c:Concept) ON (c.conceptId);', - 'CREATE INDEX documentId FOR (d:Document) ON (d.documentId);'] - - -def create_neo_csv(data, columns, output_dir='/etc/lib/neo4j/import/', - base_name='patients'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data. - columns: - What data to use from the dataframe. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - base_name: - Name of the csv. - """ - if isinstance(data, pd.DataFrame): - df = data - else: - df = pd.read_csv(data) - - # Remove duplicates - df = df.drop_duplicates(subset=columns) - - out_df = df[columns] - data_path = os.path.join(output_dir, f"{base_name}.csv") - out_df.to_csv(data_path, index=False) - - -def create_patients_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='patients'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: patientId, - sex, ethnicity, dob. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible, - but writing there could be only admin. - - Returns: - str: The query. 
- """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Patient {patientId: toString(row.patientId), \n' - ' sex: toString(row.sex), \n' - ' ethnicity: toString(row.ethnicity), \n' - ' dob: datetime(row.dob)}) \n' - ) - - create_neo_csv(data=data, columns=['patientId', 'sex', 'ethnicity', 'dob'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_documents_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='documents'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: documentId. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - - Returns: - str: The query. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Document {documentId: toString(row.documentId)}) \n' - ) - - create_neo_csv(data=data, columns=['documentId'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_concepts_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='concepts'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: conceptId, - name and type. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Concept {conceptId: toString(row.conceptId), \n' - ' type: toString(row.type), \n' - ' name: toString(row.name)}) \n' - ) - - create_neo_csv(data=data, columns=['conceptId', 'name', 'type'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_document2patient_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='document2patient'): - - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: patientId and - documentId. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (pt:Patient {patientId: toString(row.patientId)}) \n' - 'MATCH (doc:Document {documentId: toString(row.documentId)}) \n' - 'CREATE (pt)-[:HAS]->(doc); \n' - ) - - create_neo_csv(data=data, columns=['patientId', 'documentId'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_concept_ontology_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='concept_ontology'): - - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: child, parent. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. 
- """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (child:Concept {conceptId: toString(row.child)}) \n' - 'MATCH (parent:Concept {conceptId: toString(row.parent)}) \n' - 'CREATE (child)-[:IS_A]->(parent); \n' - ) - - create_neo_csv(data=data, columns=['child', 'parent'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_document2concept_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='document2concepts'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: 'conceptId', - 'documentId', 'contextSimilarity', 'start', 'end', 'timestamp', - 'metaSubject', 'metaPresence', 'metaTime'. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (doc:Document{documentId: toString(row.documentId)}) \n' - 'MATCH (concept:Concept {conceptId: toString(row.conceptId)}) \n' - 'CREATE (doc)-[:HAS {start: toInteger(row.start), \n' - ' end: toInteger(row.end), \n' - ' timestamp: toInteger(row.timestamp), \n' - ' contextSimilarity: toFloat(row.contextSimilarity), \n' - ' metaSubject: toString(row.metaSubject), \n' - ' metaPresence: toString(row.metaPresence), \n' - ' metaTime: toString(row.metaTime) \n' - ' }]->(concept); \n' - ) - - columns = ['conceptId', 'documentId', 'contextSimilarity', 'start', - 'end', 'timestamp', 'metaSubject', 'metaPresence', 'metaTime'] - - create_neo_csv(data=data, columns=columns, - output_dir=output_dir, base_name=base_name) - - return query - - -def get_data_from_docs(docs, doc2pt, doc2time=None): - data = [['conceptId', 'documentId', 'contextSimilarity', - 'start', 'end', 'timestamp', 'metaSubject', - 'metaPresence', 'metaTime']] - - for doc_id, doc in docs.items(): - row = [] - for ent in doc['entities'].values(): - #if ent['meta_anns']['Subject']['value'] == 'Patient' and \ - # ent['meta_anns']['Presence']['value'] == 'True': - if doc2time is not None: - t = doc2time[doc_id] - else: - t = ent['document_timestamp'] - - row = [ent['cui'], doc_id, - ent['context_similarity'], - ent['start'], ent['end'], - t, - ent['meta_anns'].get('Subject', {}).get('value', None), - ent['meta_anns'].get('Presence', {}).get('value', None), - ent['meta_anns'].get('Time', {}).get('value', None)] - data.append(row) - row = [] - - return data diff --git a/medcat/neo/neo_connector.py b/medcat/neo/neo_connector.py deleted file mode 100644 index 69eef0f7e..000000000 --- a/medcat/neo/neo_connector.py +++ /dev/null @@ -1,161 +0,0 @@ -from py2neo import Graph -import getpass -from collections import defaultdict - - -class NeoConnector: - def __init__(self, uri, user, password=None): - if password is None: - password = getpass.getpass("Password:") - self.graph = Graph(uri, auth=(user, password)) - - def execute(self, query): - r = self.graph.run(query) - return r - - def bucket_concepts(self, data, bucket_size_seconds): - entities = data['entities'] - - _bucket = [] - _concepts = set() - start_time = -1 - new_stream = [] - # Sort entities - entities.sort(key=lambda ent: ent['timestamp']) - for ent in entities: - if start_time == -1: - start_time = ent['timestamp'] - - if ent['timestamp'] - start_time >= bucket_size_seconds: - # Add to stream - new_stream.extend(_bucket) - _bucket = [] - _concepts = set() - start_time = ent['timestamp'] - - t_ent = 
dict(new_stream[-1]) - t_ent['timestamp'] += 1 - t_ent['name'] = '' - t_ent['conceptId'] = '' - new_stream.append(t_ent) - - if ent['conceptId'] not in _concepts: - _bucket.append(ent) - _concepts.add(ent['conceptId']) - - if _bucket: - new_stream.extend(_bucket) - - data['entities'] = new_stream - - def get_all_patients(self, concepts, limit=1000, require_time=False, ignore_meta=False): - """Return all patients having all concepts - - Args: - concepts: The concepts - limit: The maximum number of results. Defaults to 1000. - require_time: If set only concepts that have the timestamp property will be used. - """ - - q = "WITH [{}] AS cs ".format(",".join(["'{}'".format(c) for c in concepts])) - if not require_time: - q += '''MATCH (c:Concept)<-[:HAS ''' - if not ignore_meta: - q += '''{metaPresence: 'True', metaSubject: 'Patient'}''' - q += ''']-(:Document)<-[:HAS]-(pt:Patient) - WHERE c.conceptId in cs - WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt - WHERE cnt = inputCnt - ''' - else: - q += '''MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: - 'Patient'}]-(:Document)<-[:HAS]-(pt:Patient) \n - WHERE c.conceptId in cs AND exists(r.timestamp) \n - WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt \n - WHERE cnt = inputCnt \n - ''' - - q += ' RETURN pt LIMIT {}'.format(limit) - data = self.execute(q).data() # Do not like this too much - - return [n['pt']['patientId'] for n in data], q - - def get_all_concepts_from(self, patient_id=None, document_id=None, - limit=1000, bucket_size_seconds=None, min_count=0, meta_requirements=None, require_time=True): - """Returns all concepts belonging to a document or patient - given the concept type (if none all are retruned). - """ - - if patient_id is not None: - q = 'MATCH (patient:Patient {patientId: "%s"})-[:HAS]->' % patient_id \ - + '(document:Document)-[has:HAS]->(concept:Concept) \n' - elif document_id is not None: - q = 'MATCH (patient:Patient)-[:HAS]->(document:Document {documentId: "%s"})' % document_id \ - + '-[has:HAS]->(concept:Concept) \n' - else: - raise Exception("patient_id or document_id are required") - q += 'RETURN patient, document, concept, has LIMIT %s \n' % limit - - data = self.execute(q).data() # Do not like this too much - out = None - if len(data) > 0: - out = {'patient': dict(data[0]['patient']), - 'entities': []} - - cnt = defaultdict(int) - for row in data: - if meta_requirements is None or \ - all([row['has'][meta] == value for meta,value in meta_requirements.items()]): - if not require_time or 'timestamp' in row['has']: - ent = dict(row['concept']) # Take everything from concept - ent['documentId'] = row['document']['documentId'] - ent.update(row['has']) # add all the stuff from the meta ann - - out['entities'].append(ent) - cnt[ent['conceptId']] += 1 - - # Cleanup based on min_count - new_ents = [] - for ent in out['entities']: - if cnt[ent['conceptId']] >= min_count: - ent['count'] = cnt[ent['conceptId']] - new_ents.append(ent) - out['entities'] = new_ents - - if bucket_size_seconds is not None: - self.bucket_concepts(data=out, bucket_size_seconds=bucket_size_seconds) - - return out, q - - def get_all_patients_descend(self, concepts, limit=1000, require_time=False): - """Return all patients having all descendant concepts under the ancestor concept - - Args: - concepts: Ancestor top-level concepts - limit: The maximum number of results. Defaults to 1000. - require_time: If set only concepts that have the timestamp property will be used. 
- Defaults to False - Returns: - List: Patients with attached SNOMED concepts - """ - - q = "WITH [{}] AS ancestor ".format(",".join(["'{}'".format(c) for c in concepts])) - if not require_time: - q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept) - WHERE m.conceptId IN ancestor ## get the ancestor and the children - WITH [n.conceptId] AS lineage ## pass the lineage to patient match - MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient) - WHERE c.conceptId in lineage - ''' - else: - q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept) - WHERE m.conceptId IN ancestor ## get the ancestor and the children - WITH [n.conceptId] AS lineage ## pass the lineage to patient match - MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient) - WHERE c.conceptId in lineage AND exists(r.timestamp) - ''' - - q += ' RETURN pt.patientId, pt.sex, c.conceptId, c.name, r.timestamp LIMIT {}'.format(limit) - data = self.execute(q).data() # Do not like this too much - - return [n['pt']['patientId'] for n in data], q diff --git a/setup.py b/setup.py index db91b6a0a..646369c9c 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ long_description_content_type="text/markdown", url="https://github.com/CogStack/MedCAT", packages=['medcat', 'medcat.utils', 'medcat.preprocessing', 'medcat.ner', 'medcat.linking', 'medcat.datasets', - 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.neo', 'medcat.utils.ner', + 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', 'medcat.utils.saving', 'medcat.utils.regression'], install_requires=[ 'numpy>=1.22.0', # first to support 3.11 @@ -34,7 +34,6 @@ 'psutil>=5.8.0', # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets 'multiprocess~=0.70.12', # 0.70.14 seemed to work just fine - 'py2neo~=2021.2.3', 'aiofiles>=0.8.0', # allow later versions, tested with 22.1.0 'ipywidgets>=7.6.5', # allow later versions, tested with 0.8.0 'xxhash>=3.0.0', # allow later versions, tested with 3.1.0 From 128d9ea7441e59b38897b890ad36d2d67360c56e Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 10 Oct 2023 14:04:06 +0300 Subject: [PATCH 14/64] CU-8692wgmkm: Remove py2neo dependency and the code that used it (#356) * CU-8692wgmkm: Remove py2neo dependency and the code that used it * CU-8692wgmkm: Remove medcat.neo package from setup.py --- medcat/neo/__init__.py | 0 medcat/neo/data_preparation.py | 231 --------------------------------- medcat/neo/neo_connector.py | 161 ----------------------- setup.py | 3 +- 4 files changed, 1 insertion(+), 394 deletions(-) delete mode 100644 medcat/neo/__init__.py delete mode 100644 medcat/neo/data_preparation.py delete mode 100644 medcat/neo/neo_connector.py diff --git a/medcat/neo/__init__.py b/medcat/neo/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/medcat/neo/data_preparation.py b/medcat/neo/data_preparation.py deleted file mode 100644 index 551c3117e..000000000 --- a/medcat/neo/data_preparation.py +++ /dev/null @@ -1,231 +0,0 @@ -import os -import pandas as pd - - -def get_index_queries(): - """Run before everything to speed up things.""" - return ['CREATE INDEX patientId FOR (p:Patient) ON (p.patientId);', - 'CREATE INDEX conceptId FOR (c:Concept) ON (c.conceptId);', - 'CREATE INDEX documentId FOR (d:Document) ON (d.documentId);'] - - -def create_neo_csv(data, columns, output_dir='/etc/lib/neo4j/import/', - base_name='patients'): - 
"""Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data. - columns: - What data to use from the dataframe. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - base_name: - Name of the csv. - """ - if isinstance(data, pd.DataFrame): - df = data - else: - df = pd.read_csv(data) - - # Remove duplicates - df = df.drop_duplicates(subset=columns) - - out_df = df[columns] - data_path = os.path.join(output_dir, f"{base_name}.csv") - out_df.to_csv(data_path, index=False) - - -def create_patients_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='patients'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: patientId, - sex, ethnicity, dob. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible, - but writing there could be only admin. - - Returns: - str: The query. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Patient {patientId: toString(row.patientId), \n' - ' sex: toString(row.sex), \n' - ' ethnicity: toString(row.ethnicity), \n' - ' dob: datetime(row.dob)}) \n' - ) - - create_neo_csv(data=data, columns=['patientId', 'sex', 'ethnicity', 'dob'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_documents_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='documents'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: documentId. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - - Returns: - str: The query. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Document {documentId: toString(row.documentId)}) \n' - ) - - create_neo_csv(data=data, columns=['documentId'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_concepts_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='concepts'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: conceptId, - name and type. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'CREATE (:Concept {conceptId: toString(row.conceptId), \n' - ' type: toString(row.type), \n' - ' name: toString(row.name)}) \n' - ) - - create_neo_csv(data=data, columns=['conceptId', 'name', 'type'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_document2patient_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='document2patient'): - - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: patientId and - documentId. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. 
- """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (pt:Patient {patientId: toString(row.patientId)}) \n' - 'MATCH (doc:Document {documentId: toString(row.documentId)}) \n' - 'CREATE (pt)-[:HAS]->(doc); \n' - ) - - create_neo_csv(data=data, columns=['patientId', 'documentId'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_concept_ontology_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='concept_ontology'): - - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: child, parent. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (child:Concept {conceptId: toString(row.child)}) \n' - 'MATCH (parent:Concept {conceptId: toString(row.parent)}) \n' - 'CREATE (child)-[:IS_A]->(parent); \n' - ) - - create_neo_csv(data=data, columns=['child', 'parent'], - output_dir=output_dir, base_name=base_name) - - return query - - -def create_document2concept_csv(data, output_dir='/etc/lib/neo4j/import/', - base_name='document2concepts'): - """Creates a patients CSV for neo4j load csv function - - Args: - data: - A dataframe or path to a dataframe with the required data: 'conceptId', - 'documentId', 'contextSimilarity', 'start', 'end', 'timestamp', - 'metaSubject', 'metaPresence', 'metaTime'. - output_dir: - Where to save the CSVs, should be the neo4j imports path if possible. - """ - query = ( - 'USING PERIODIC COMMIT 100000 \n' - f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n' - 'MATCH (doc:Document{documentId: toString(row.documentId)}) \n' - 'MATCH (concept:Concept {conceptId: toString(row.conceptId)}) \n' - 'CREATE (doc)-[:HAS {start: toInteger(row.start), \n' - ' end: toInteger(row.end), \n' - ' timestamp: toInteger(row.timestamp), \n' - ' contextSimilarity: toFloat(row.contextSimilarity), \n' - ' metaSubject: toString(row.metaSubject), \n' - ' metaPresence: toString(row.metaPresence), \n' - ' metaTime: toString(row.metaTime) \n' - ' }]->(concept); \n' - ) - - columns = ['conceptId', 'documentId', 'contextSimilarity', 'start', - 'end', 'timestamp', 'metaSubject', 'metaPresence', 'metaTime'] - - create_neo_csv(data=data, columns=columns, - output_dir=output_dir, base_name=base_name) - - return query - - -def get_data_from_docs(docs, doc2pt, doc2time=None): - data = [['conceptId', 'documentId', 'contextSimilarity', - 'start', 'end', 'timestamp', 'metaSubject', - 'metaPresence', 'metaTime']] - - for doc_id, doc in docs.items(): - row = [] - for ent in doc['entities'].values(): - #if ent['meta_anns']['Subject']['value'] == 'Patient' and \ - # ent['meta_anns']['Presence']['value'] == 'True': - if doc2time is not None: - t = doc2time[doc_id] - else: - t = ent['document_timestamp'] - - row = [ent['cui'], doc_id, - ent['context_similarity'], - ent['start'], ent['end'], - t, - ent['meta_anns'].get('Subject', {}).get('value', None), - ent['meta_anns'].get('Presence', {}).get('value', None), - ent['meta_anns'].get('Time', {}).get('value', None)] - data.append(row) - row = [] - - return data diff --git a/medcat/neo/neo_connector.py b/medcat/neo/neo_connector.py deleted file mode 100644 index 69eef0f7e..000000000 --- a/medcat/neo/neo_connector.py +++ /dev/null @@ -1,161 +0,0 @@ -from py2neo import Graph -import getpass -from collections 
import defaultdict - - -class NeoConnector: - def __init__(self, uri, user, password=None): - if password is None: - password = getpass.getpass("Password:") - self.graph = Graph(uri, auth=(user, password)) - - def execute(self, query): - r = self.graph.run(query) - return r - - def bucket_concepts(self, data, bucket_size_seconds): - entities = data['entities'] - - _bucket = [] - _concepts = set() - start_time = -1 - new_stream = [] - # Sort entities - entities.sort(key=lambda ent: ent['timestamp']) - for ent in entities: - if start_time == -1: - start_time = ent['timestamp'] - - if ent['timestamp'] - start_time >= bucket_size_seconds: - # Add to stream - new_stream.extend(_bucket) - _bucket = [] - _concepts = set() - start_time = ent['timestamp'] - - t_ent = dict(new_stream[-1]) - t_ent['timestamp'] += 1 - t_ent['name'] = '' - t_ent['conceptId'] = '' - new_stream.append(t_ent) - - if ent['conceptId'] not in _concepts: - _bucket.append(ent) - _concepts.add(ent['conceptId']) - - if _bucket: - new_stream.extend(_bucket) - - data['entities'] = new_stream - - def get_all_patients(self, concepts, limit=1000, require_time=False, ignore_meta=False): - """Return all patients having all concepts - - Args: - concepts: The concepts - limit: The maximum number of results. Defaults to 1000. - require_time: If set only concepts that have the timestamp property will be used. - """ - - q = "WITH [{}] AS cs ".format(",".join(["'{}'".format(c) for c in concepts])) - if not require_time: - q += '''MATCH (c:Concept)<-[:HAS ''' - if not ignore_meta: - q += '''{metaPresence: 'True', metaSubject: 'Patient'}''' - q += ''']-(:Document)<-[:HAS]-(pt:Patient) - WHERE c.conceptId in cs - WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt - WHERE cnt = inputCnt - ''' - else: - q += '''MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: - 'Patient'}]-(:Document)<-[:HAS]-(pt:Patient) \n - WHERE c.conceptId in cs AND exists(r.timestamp) \n - WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt \n - WHERE cnt = inputCnt \n - ''' - - q += ' RETURN pt LIMIT {}'.format(limit) - data = self.execute(q).data() # Do not like this too much - - return [n['pt']['patientId'] for n in data], q - - def get_all_concepts_from(self, patient_id=None, document_id=None, - limit=1000, bucket_size_seconds=None, min_count=0, meta_requirements=None, require_time=True): - """Returns all concepts belonging to a document or patient - given the concept type (if none all are retruned). 
- """ - - if patient_id is not None: - q = 'MATCH (patient:Patient {patientId: "%s"})-[:HAS]->' % patient_id \ - + '(document:Document)-[has:HAS]->(concept:Concept) \n' - elif document_id is not None: - q = 'MATCH (patient:Patient)-[:HAS]->(document:Document {documentId: "%s"})' % document_id \ - + '-[has:HAS]->(concept:Concept) \n' - else: - raise Exception("patient_id or document_id are required") - q += 'RETURN patient, document, concept, has LIMIT %s \n' % limit - - data = self.execute(q).data() # Do not like this too much - out = None - if len(data) > 0: - out = {'patient': dict(data[0]['patient']), - 'entities': []} - - cnt = defaultdict(int) - for row in data: - if meta_requirements is None or \ - all([row['has'][meta] == value for meta,value in meta_requirements.items()]): - if not require_time or 'timestamp' in row['has']: - ent = dict(row['concept']) # Take everything from concept - ent['documentId'] = row['document']['documentId'] - ent.update(row['has']) # add all the stuff from the meta ann - - out['entities'].append(ent) - cnt[ent['conceptId']] += 1 - - # Cleanup based on min_count - new_ents = [] - for ent in out['entities']: - if cnt[ent['conceptId']] >= min_count: - ent['count'] = cnt[ent['conceptId']] - new_ents.append(ent) - out['entities'] = new_ents - - if bucket_size_seconds is not None: - self.bucket_concepts(data=out, bucket_size_seconds=bucket_size_seconds) - - return out, q - - def get_all_patients_descend(self, concepts, limit=1000, require_time=False): - """Return all patients having all descendant concepts under the ancestor concept - - Args: - concepts: Ancestor top-level concepts - limit: The maximum number of results. Defaults to 1000. - require_time: If set only concepts that have the timestamp property will be used. 
- Defaults to False - Returns: - List: Patients with attached SNOMED concepts - """ - - q = "WITH [{}] AS ancestor ".format(",".join(["'{}'".format(c) for c in concepts])) - if not require_time: - q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept) - WHERE m.conceptId IN ancestor ## get the ancestor and the children - WITH [n.conceptId] AS lineage ## pass the lineage to patient match - MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient) - WHERE c.conceptId in lineage - ''' - else: - q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept) - WHERE m.conceptId IN ancestor ## get the ancestor and the children - WITH [n.conceptId] AS lineage ## pass the lineage to patient match - MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient) - WHERE c.conceptId in lineage AND exists(r.timestamp) - ''' - - q += ' RETURN pt.patientId, pt.sex, c.conceptId, c.name, r.timestamp LIMIT {}'.format(limit) - data = self.execute(q).data() # Do not like this too much - - return [n['pt']['patientId'] for n in data], q diff --git a/setup.py b/setup.py index 25a7a7c43..8b152cb77 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ long_description_content_type="text/markdown", url="https://github.com/CogStack/MedCAT", packages=['medcat', 'medcat.utils', 'medcat.preprocessing', 'medcat.ner', 'medcat.linking', 'medcat.datasets', - 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.neo', 'medcat.utils.ner', + 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', 'medcat.utils.saving', 'medcat.utils.regression'], install_requires=[ 'numpy>=1.22.0', # first to support 3.11 @@ -34,7 +34,6 @@ 'psutil>=5.8.0', # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets 'multiprocess~=0.70.12', # 0.70.14 seemed to work just fine - 'py2neo~=2021.2.3', 'aiofiles>=0.8.0', # allow later versions, tested with 22.1.0 'ipywidgets>=7.6.5', # allow later versions, tested with 0.8.0 'xxhash>=3.0.0', # allow later versions, tested with 3.1.0 From b3210f7932e96e8469da6e059d417d27394a4584 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Fri, 20 Oct 2023 14:04:55 +0300 Subject: [PATCH 15/64] Cu 8692wbcq5 docs builds (#359) * CU-8692wbcq5: Pin max version of numpy * CU-8692wbcq5: Pin max version of numpy in setup.py * CU-8692wbcq5: Bump python version for readthedocs workflow * CU-8692wbcq5: Pin all requirement versions in docs requirements * CU-8692wbcq5: Move docs requirements before setuptools * CU-8692wbcq5: Fix typo in docs requirements * CU-8692wbcq5: Remove some less relevant stuff from docs requirements * CU-8692wbcq5: Add back sphinx-based requirements to docs requirements * CU-8692wbcq5: Move back to python 3.9 on docs build workflow * CU-8692wbcq5: Bump sphinx-autoapi version * CU-8692wbcq5: Bump sphinx version * CU-8692wbcq5: Bump python version back to 3.10 for future-proofing * CU-8692wbcq5: Undo pinning numpy to max version in setup.py * CU-8692wbcq5: Remove docs-build specific dependencies in setup.py --- .readthedocs.yaml | 6 +-- docs/requirements.txt | 106 ++++++++++++++++++++++++++++++++++++++++-- setup.py | 6 --- 3 files changed, 105 insertions(+), 13 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8c4e65615..5cc0d97f0 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,13 +7,13 @@ version: 2 build: os: ubuntu-20.04 tools: - python: "3.9" + python: "3.10" sphinx: configuration: docs/conf.py python: 
install: + - requirements: docs/requirements.txt - method: setuptools - path: . - - requirements: docs/requirements.txt \ No newline at end of file + path: . \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index be517876f..7e7df6e01 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,104 @@ -Sphinx~=4.0 +sphinx==6.2.1 sphinx-rtd-theme~=1.0 myst-parser~=0.17 -sphinx-autoapi~=1.8 -setuptools>=60.0 -aiohttp==3.8.5 \ No newline at end of file +sphinx-autoapi~=3.0.0 +MarkupSafe==2.1.3 +accelerate==0.23.0 +aiofiles==23.2.1 +aiohttp==3.8.5 +aiosignal==1.3.1 +asttokens==2.4.0 +async-timeout==4.0.3 +attrs==23.1.0 +backcall==0.2.0 +blis==0.7.11 +catalogue==2.0.10 +certifi==2023.7.22 +charset-normalizer==3.3.0 +click==8.1.7 +comm==0.1.4 +confection==0.1.3 +cymem==2.0.8 +datasets==2.14.5 +decorator==5.1.1 +dill==0.3.7 +exceptiongroup==1.1.3 +executing==2.0.0 +filelock==3.12.4 +flake8==4.0.1 +frozenlist==1.4.0 +fsspec==2023.6.0 +gensim==4.3.2 +huggingface-hub==0.17.3 +idna==3.4 +ipython==8.16.1 +ipywidgets==8.1.1 +jedi==0.19.1 +jinja2==3.1.2 +joblib==1.3.2 +jsonpickle==3.0.2 +jupyterlab-widgets==3.0.9 +langcodes==3.3.0 +matplotlib-inline==0.1.6 +mccabe==0.6.1 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.15 +murmurhash==1.0.10 +mypy==1.0.0 +mypy-extensions==0.4.3 +networkx==3.1 +numpy==1.25.2 +packaging==23.2 +pandas==2.1.1 +parso==0.8.3 +pathy==0.10.2 +pexpect==4.8.0 +pickleshare==0.7.5 +preshed==3.0.9 +prompt-toolkit==3.0.39 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyarrow==13.0.0 +pycodestyle==2.8.0 +pydantic==1.10.13 +pyflakes==2.4.0 +pygments==2.16.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +pyyaml==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.0 +scikit-learn==1.3.1 +scipy==1.9.3 +six==1.16.0 +smart-open==6.4.0 +spacy==3.4.4 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 +stack-data==0.6.3 +sympy==1.12 +thinc==8.1.12 +threadpoolctl==3.2.0 +tokenizers==0.14.1 +tomli==2.0.1 +torch==2.1.0 +tqdm==4.66.1 +traitlets==5.11.2 +transformers==4.34.0 +triton==2.1.0 +typer==0.7.0 +types-PyYAML==6.0.3 +types-aiofiles==0.8.3 +types-setuptools==57.4.10 +typing-extensions==4.8.0 +tzdata==2023.3 +urllib3==2.0.6 +wasabi==0.10.1 +wcwidth==0.2.8 +widgetsnbextension==4.0.9 +xxhash==3.4.1 +yarl==1.9.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 8b152cb77..ab49eaff1 100644 --- a/setup.py +++ b/setup.py @@ -40,12 +40,6 @@ 'blis>=0.7.5', # allow later versions, tested with 0.7.9 'click>=8.0.4', # allow later versions, tested with 8.1.3 'pydantic>=1.10.0,<2.0', # for spacy compatibility; avoid 2.0 due to breaking changes - # the following are not direct dependencies of MedCAT but needed for docs/building - # hopefully will no longer need the transitive dependencies - 'aiohttp==3.8.5', # 3.8.3 is needed for compatibility with fsspec <- datasets <- medcat - 'blis<0.8.0,>=0.7.8', # as required by thinc <- spacy <- medcat - # 'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy - # 'joblib~=1.2', ], classifiers=[ "Programming Language :: Python :: 3", From ed840d07a51b36734807317ae2947f28c8d2332b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Oct 2023 11:06:29 +0000 Subject: [PATCH 16/64] Bump urllib3 from 1.26.17 to 1.26.18 in /webapp/webapp Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.17 to 1.26.18. 
- [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.17...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..cef7a036d 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -3,4 +3,4 @@ django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 medcat==1.2.7 -urllib3==1.26.17 +urllib3==1.26.18 From d377f0b6a38b0b2e46dd7224b0a87cf015b767d9 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 14:06:34 +0200 Subject: [PATCH 17/64] CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory (#352) * CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory * CU-8692uznvd: Move the empty-set detection and conversion from validator to init * CU-8692uznvd: Remove unused import --- medcat/config.py | 13 +++++++++++++ tests/test_config.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/medcat/config.py b/medcat/config.py index b2e324deb..87c6d34f5 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -433,6 +433,19 @@ class LinkingFilters(MixingConfig, BaseModel): cuis: Set[str] = set() cuis_exclude: Set[str] = set() + def __init__(self, **data): + if 'cuis' in data: + cuis = data['cuis'] + if isinstance(cuis, dict) and len(cuis) == 0: + logger.warning("Loading an old model where " + "config.linking.filters.cuis has been " + "dict to an empty dict instead of an empty " + "set. Converting the dict to a set in memory " + "as that is what is expected. 
Please consider " + "saving the model again.") + data['cuis'] = set(cuis.keys()) + super().__init__(**data) + def check_filters(self, cui: str) -> bool: """Checks is a CUI in the filters diff --git a/tests/test_config.py b/tests/test_config.py index 2f9cd5a84..aacd0a760 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,7 @@ import unittest import pickle import tempfile -from medcat.config import Config, MixingConfig, VersionInfo, General +from medcat.config import Config, MixingConfig, VersionInfo, General, LinkingFilters from pydantic import ValidationError import os @@ -180,5 +180,24 @@ def test_from_dict(self): self.assertEqual("value", config.key) +class ConfigLinkingFiltersTests(unittest.TestCase): + + def test_allows_empty_dict_for_cuis(self): + lf = LinkingFilters(cuis={}) + self.assertIsNotNone(lf) + + def test_empty_dict_converted_to_empty_set(self): + lf = LinkingFilters(cuis={}) + self.assertEqual(lf.cuis, set()) + + def test_not_allow_nonempty_dict_for_cuis(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis={"KEY": "VALUE"}) + + def test_not_allow_empty_dict_for_cuis_exclude(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis_exclude={}) + + if __name__ == '__main__': unittest.main() From ad6704891db9fa91169b4ef6934fa18e5242184f Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 16:27:37 +0200 Subject: [PATCH 18/64] CU-8692t3fdf separate config on save (#350) * CU-8692t3fdf Move saving config outside of the cdb.dat; Add test to make sure the config does not get saved with the CDB; patch a few existing tests * CU-8692t3fdf Use class methods on class instead of instance in a few tests * CU-8692t3fdf Fix typing issue * CU-8692t3fdf Add additional tests for 2 configs and zero configs when loading model pack * CU-8692t3fdf: Make sure CDB is linked to the correct config; Treat incorrect configs as dirty CDBs and force a recalc of the hash --- medcat/cat.py | 8 +++ medcat/cdb.py | 73 +++++++++++++++++++++++- medcat/config.py | 8 ++- medcat/utils/saving/serializer.py | 15 ++++- tests/test_cat.py | 52 ++++++++++++++++- tests/test_cdb.py | 8 +++ tests/test_config.py | 29 ++++++++++ tests/utils/saving/test_serialization.py | 2 +- tests/utils/test_hashing.py | 47 ++++++++++++++- 9 files changed, 229 insertions(+), 13 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..0fb6b1167 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -271,6 +271,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M cdb_path = os.path.join(save_dir_path, "cdb.dat") self.cdb.save(cdb_path, json_path) + # Save the config + config_path = os.path.join(save_dir_path, "config.json") + self.cdb.config.save(config_path) + # Save the Vocab vocab_path = os.path.join(save_dir_path, "vocab.dat") if self.vocab is not None: @@ -362,6 +366,10 @@ def load_model_pack(cls, logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format') cdb = CDB.load(cdb_path, json_path) + # load config + config_path = os.path.join(model_pack_path, "config.json") + cdb.load_config(config_path) + # TODO load addl_ner # Modify the config to contain full path to spacy model diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..5a648f4af 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -5,8 +5,9 @@ import logging import aiofiles import numpy as np -from typing import Dict, Set, Optional, List, Union +from typing import Dict, Set, Optional, List, Union, cast from functools import partial 
+import os from medcat import __version__ from medcat.utils.hasher import Hasher @@ -61,8 +62,10 @@ class CDB(object): def __init__(self, config: Union[Config, None] = None) -> None: if config is None: self.config = Config() + self._config_from_file = False else: self.config = config + self._config_from_file = True self.name2cuis: Dict = {} self.name2cuis2status: Dict = {} @@ -95,6 +98,12 @@ def __init__(self, config: Union[Config, None] = None) -> None: self._optim_params = None self.is_dirty = False self._hash: Optional[str] = None + # the config hash is kept track of here so that + # the CDB hash can be re-calculated when the config changes + # it can also be used to make sure the config loaded with + # a CDB matches the config it was saved with + # since the config is now saved separately + self._config_hash: Optional[str] = None self._memory_optimised_parts: Set[str] = set() def get_name(self, cui: str) -> str: @@ -458,6 +467,35 @@ async def save_async(self, path: str) -> None: } await f.write(dill.dumps(to_save)) + def load_config(self, config_path: str) -> None: + if not os.path.exists(config_path): + if not self._config_from_file: + # if there's no config defined anywhere + raise ValueError("Could not find a config in the CDB nor ", + "in the config.json for this model " + f"({os.path.dirname(config_path)})", + ) + # if there is a config, but it's defined in the cdb.dat file + logger.warning("Could not find config.json in model pack folder " + f"({os.path.dirname(config_path)}). " + "This is probably an older model. Please save the model " + "again in the new format to avoid potential issues.") + else: + if self._config_from_file: + # if there's a config.json and one defined in the cbd.dat file + raise ValueError("Found a config in the CDB and in the config.json " + f"for model ({os.path.dirname(config_path)}) - " + "this is ambiguous. Please either remove the " + "config.json or load the CDB without the config.json " + "in the folder and re-save in the newer format " + "(the default save in this version)") + # if the only config is in the separate config.json file + # this should be the behaviour for all newer models + self.config = cast(Config, Config.load(config_path)) + logger.debug("Loaded config from CDB from %s", config_path) + # mark config read from file + self._config_from_file = True + @classmethod def load(cls, path: str, json_path: Optional[str] = None, config_dict: Optional[Dict] = None) -> "CDB": """Load and return a CDB. This allows partial loads in probably not the right way at all. @@ -777,8 +815,34 @@ def _check_medcat_version(cls, config_data: Dict) -> None: or download the compatible model.""" ) + def _should_recalc_hash(self, force_recalc: bool) -> bool: + if force_recalc: + return True + if self.config.hash is None: + # TODO - perhaps this is not the best? 
+ # as this is a side effect + # get and save result in config + self.config.get_hash() + if not self._hash or self.is_dirty: + # if no hash saved or is dirty + # need to calculate + logger.debug("Recalculating hash due to %s", + "no hash saved" if not self._hash else "CDB is dirty") + return True + # recalc config hash in case it changed + self.config.get_hash() + if self._config_hash is None or self._config_hash != self.config.hash: + # if no config hash saved + # or if the config hash is different from one saved in here + logger.debug("Recalculating hash due to %s", + "no config hash saved" if not self._config_hash + else "config hash has changed") + return True + return False + def get_hash(self, force_recalc: bool = False): - if not force_recalc and self._hash and not self.is_dirty: + should_recalc = self._should_recalc_hash(force_recalc) + if not should_recalc: logger.info("Reusing old hash of CDB since the CDB has not changed: %s", self._hash) return self._hash self.is_dirty = False @@ -791,7 +855,7 @@ def calculate_hash(self): for k,v in self.__dict__.items(): if k in ['cui2countext_vectors', 'name2cuis']: hasher.update(v, length=False) - elif k in ['_hash', 'is_dirty']: + elif k in ['_hash', 'is_dirty', '_config_hash']: # ignore _hash since if it previously didn't exist, the # new hash would be different when the value does exist # and ignore is_dirty so that we get the same hash as previously @@ -799,6 +863,9 @@ def calculate_hash(self): elif k != 'config': hasher.update(v, length=True) + # set cached config hash + self._config_hash = self.config.hash + self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash diff --git a/medcat/config.py b/medcat/config.py index 87c6d34f5..e60c2eafc 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -548,6 +548,7 @@ class Config(MixingConfig, BaseModel): linking: Linking = Linking() word_skipper: re.Pattern = re.compile('') # empty pattern gets replaced upon init punct_checker: re.Pattern = re.compile('') # empty pattern gets replaced upon init + hash: Optional[str] = None class Config: # this if for word_skipper and punct_checker which would otherwise @@ -572,6 +573,9 @@ def rebuild_re(self) -> None: def get_hash(self): hasher = Hasher() for k, v in self.dict().items(): + if k in ['hash', ]: + # ignore hash + continue if k not in ['version', 'general', 'linking']: hasher.update(v, length=True) elif k == 'general': @@ -587,5 +591,5 @@ def get_hash(self): hasher.update(v2, length=False) else: hasher.update(v2, length=True) - - return hasher.hexdigest() + self.hash = hasher.hexdigest() + return self.hash diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index d82df751c..25529c778 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -135,13 +135,12 @@ def serialize(self, cdb, overwrite: bool = False) -> None: raise ValueError(f'Unable to overwrite shelf path "{self.json_path}"' ' - specify overrwrite=True if you wish to overwrite') to_save = {} - to_save['config'] = cdb.config.asdict() # This uses different names so as to not be ambiguous # when looking at files whether the json parts should # exist separately or not to_save['cdb_main' if self.jsons is not None else 'cdb'] = dict( ((key, val) for key, val in cdb.__dict__.items() if - key != 'config' and + key not in ('config', '_config_from_file') and (self.jsons is None or key not in SPECIALITY_NAMES))) logger.info('Dumping CDB to %s', self.main_path) with open(self.main_path, 
'wb') as f: @@ -165,7 +164,17 @@ def deserialize(self, cdb_cls): logger.info('Reading CDB data from %s', self.main_path) with open(self.main_path, 'rb') as f: data = dill.load(f) - config = cast(Config, Config.from_dict(data['config'])) + if 'config' in data: + logger.warning("Found config in CDB for model (%s). " + "This is an old format. Please re-save the " + "model in the new format to avoid potential issues", + os.path.dirname(self.main_path)) + config = cast(Config, Config.from_dict(data['config'])) + else: + # by passing None as config to constructor + # the CDB should identify that there has been + # no config loaded + config = None cdb = cdb_cls(config=config) if self.jsons is None: cdb_main = data['cdb'] diff --git a/tests/test_cat.py b/tests/test_cat.py index 0baa0d35d..62db4d44d 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -367,7 +367,7 @@ def test_load_model_pack(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(repr(cat._meta_cats), repr([meta_cat])) @@ -377,7 +377,7 @@ def test_load_model_pack_without_meta_cat(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(cat._meta_cats, []) @@ -385,10 +385,56 @@ def test_load_model_pack_without_meta_cat(self): def test_hashing(self): save_dir_path = tempfile.TemporaryDirectory() full_model_pack_name = self.undertest.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) +class ModelWithTwoConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples") + cdb = CDB.load(os.path.join(cls.model_path, "cdb.dat")) + # save config next to the CDB + cls.config_path = os.path.join(cls.model_path, 'config.json') + cdb.config.save(cls.config_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE config next to the CDB + os.remove(cls.config_path) + + def test_loading_model_pack_with_cdb_config_and_config_json_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.model_path) + + +class ModelWithZeroConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cdb_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat") + cdb = CDB.load(cdb_path) + 
vocab_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat") + # copy the CDB and vocab to a temp dir + cls.temp_dir = tempfile.TemporaryDirectory() + cls.cdb_path = os.path.join(cls.temp_dir.name, 'cdb.dat') + cdb.save(cls.cdb_path) # save without internal config + cls.vocab_path = os.path.join(cls.temp_dir.name, 'vocab.dat') + shutil.copyfile(vocab_path, cls.vocab_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE temp dir + cls.temp_dir.cleanup() + + def test_loading_model_pack_without_any_config_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.temp_dir.name) + + def _get_meta_cat(meta_cat_dir): config = ConfigMetaCAT() config.general["category_name"] = "Status" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 96425bc8c..f7be24d64 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -6,6 +6,7 @@ import numpy as np from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.cdb import CDB class CDBTests(unittest.TestCase): @@ -53,6 +54,13 @@ def test_save_and_load(self): self.undertest.save(f.name) self.undertest.load(f.name) + def test_load_has_no_config(self): + with tempfile.NamedTemporaryFile() as f: + self.undertest.save(f.name) + cdb = CDB.load(f.name) + self.assertFalse(cdb._config_from_file) + + def test_save_async_and_load(self): with tempfile.NamedTemporaryFile() as f: asyncio.run(self.undertest.save_async(f.name)) diff --git a/tests/test_config.py b/tests/test_config.py index aacd0a760..ce6ed76eb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -179,6 +179,35 @@ def test_from_dict(self): config = Config.from_dict({"key": "value"}) self.assertEqual("value", config.key) + def test_config_no_hash_before_get(self): + config = Config() + self.assertIsNone(config.hash) + + def test_config_has_hash_after_get(self): + config = Config() + config.get_hash() + self.assertIsNotNone(config.hash) + + def test_config_hash_recalc_same_def(self): + config = Config() + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + + def test_config_hash_changes_after_change(self): + config = Config() + h1 = config.get_hash() + config.linking.filters.cuis = {"a", "b"} + h2 = config.get_hash() + self.assertNotEqual(h1, h2) + + def test_config_hash_recalc_same_changed(self): + config = Config() + config.linking.filters.cuis = {"a", "b"} + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + class ConfigLinkingFiltersTests(unittest.TestCase): diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index f0cc75de1..c2c44da16 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -87,7 +87,7 @@ def test_dill_to_json(self): model_pack_folder = os.path.join( self.json_model_pack.name, model_pack_path) json_path = os.path.join(model_pack_folder, "*.json") - jsons = glob.glob(json_path) + jsons = [fn for fn in glob.glob(json_path) if not fn.endswith("config.json")] # there is also a model_card.json # but nothing for cui2many or name2many # so can remove the length of ONE2MANY diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..b6681461f 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -1,4 +1,5 @@ import os +from typing import Optional import tempfile import unittest import unittest.mock @@ -6,6 +7,7 @@ from medcat.cat import CAT from medcat.cdb import CDB from 
medcat.vocab import Vocab +from medcat.config import Config class CDBHashingTests(unittest.TestCase): @@ -30,6 +32,43 @@ def test_CDB_hash_saves_on_disk(self): self.assertEqual(h, cdb._hash) +class CDBHashingWithConfigTests(unittest.TestCase): + temp_dir = tempfile.TemporaryDirectory() + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + # ensure config has hash + h = cls.cdb.get_hash() + cls.config = cls.config_copy(cls.cdb.config) + cls._config_hash = cls.cdb.config.hash + + @classmethod + def config_copy(cls, config: Optional[Config] = None) -> Config: + if config is None: + config = cls.config + return Config(**config.asdict()) + + def setUp(self) -> None: + # reset config + self.cdb.config = self.config_copy() + # reset config hash + self.cdb._config_hash = self._config_hash + self.cdb.config.hash = self._config_hash + + def test_CDB_same_hash_no_need_recalc(self): + self.assertFalse(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_if_no_config_hash(self): + self.cdb._config_hash = None + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_after_config_change(self): + self.cdb.config.linking.filters.cuis = {"a", "b", "c"} + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + class BaseCATHashingTests(unittest.TestCase): @classmethod @@ -75,8 +114,14 @@ def test_no_changes_recalc_same(self): class CATHashingTestsWithoutChange(CATHashingTestsWithFakeHash): - def test_no_changes_no_calc(self): + def setUp(self) -> None: + self._calculate_hash = self.undertest.cdb.calculate_hash + # make sure the hash exists + self.undertest.cdb._config_hash = self.undertest.cdb.config.get_hash() + self.undertest.cdb.get_hash() self.undertest.cdb.calculate_hash = unittest.mock.Mock() + + def test_no_changes_no_calc(self): hash = self.undertest.get_hash() self.assertIsInstance(hash, str) self.undertest.cdb.calculate_hash.assert_not_called() From e52bda3547dfa61c671727746058f67a21da3576 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 31 Oct 2023 11:44:00 +0200 Subject: [PATCH 19/64] CU-2cdpd4t: Unify default addl_info in different methdos. (#363) --- medcat/cat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 0fb6b1167..a86584c3a 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1340,7 +1340,7 @@ def multiprocessing(self, nproc: int = 2, batch_size_chars: int = 5000 * 1000, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], separate_nn_components: bool = True, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), @@ -1536,7 +1536,7 @@ def multiprocessing_pipe(self, nproc: Optional[int] = None, batch_size: Optional[int] = None, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: """Run multiprocessing NOT FOR TRAINING From b6ab62ca2e5c0f654dcd34271ce0d72a70f603cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Nov 2023 22:13:07 +0000 Subject: [PATCH 20/64] Bump django from 3.2.20 to 3.2.23 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.20 to 3.2.23. 
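An illustrative sketch of the hash behaviour exercised by the tests above (the CDB path is a placeholder): a cached hash is reused until the CDB becomes dirty or its config changes.

```python
from medcat.cdb import CDB

cdb = CDB.load("cdb.dat")  # placeholder path
cdb.get_hash()             # calculates the CDB hash and caches the config hash

# nothing changed, so the cached hash can be reused
assert not cdb._should_recalc_hash(force_recalc=False)

# any config change (here: linking filters) forces a recalculation
cdb.config.linking.filters.cuis = {"C0000001", "C0000002"}
assert cdb._should_recalc_hash(force_recalc=False)
```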
- [Commits](https://github.com/django/django/compare/3.2.20...3.2.23) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..4ace8c6f3 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.20 +Django==3.2.23 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From 94827bb50cf11526daa8f787b8669395b621043b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 3 Nov 2023 15:20:50 +0000 Subject: [PATCH 21/64] Changing cdb.add_concept to a protected method --- medcat/cat.py | 2 +- medcat/cdb.py | 7 ++++--- medcat/cdb_maker.py | 2 +- tests/archive_tests/test_cdb_maker_archive.py | 2 +- tests/utils/test_hashing.py | 6 +++--- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..5bdf43211 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -834,7 +834,7 @@ def add_and_train_concept(self, names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) if spacy_entity is not None and spacy_doc is not None: diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..91bb7bd9d 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -213,9 +213,9 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b # Name status must be one of the three name_status = 'A' - self.add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) + self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) - def add_concept(self, + def _add_concept(self, cui: str, names: Dict, ontologies: set, @@ -232,7 +232,8 @@ def add_concept(self, the same CUI will be merged internally. names (Dict[str, Dict]): Names for this concept, or the value that if found in free text can be linked to this concept. - Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' ontologies (Set[str]): ontologies in which the concept exists (e.g. 
SNOMEDCT, HPO) name_status (str): diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index e9c72d12e..ca98f821e 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -173,7 +173,7 @@ def prepare_csvs(self, if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']: prepare_name(raw_name, self.pipe.spacy_nlp, names, self.config) - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) # DEBUG logger.debug("\n\n**** Added\n CUI: %s\n Names: %s\n Ontologies: %s\n Name status: %s\n Type IDs: %s\n Description: %s\n Is full build: %s", diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 329408999..9e2fc2d72 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -108,7 +108,7 @@ def test_concept_similarity(self): for i in range(500): cui = "C" + str(i) type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), + cdb._add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), name_status='P', type_ids=type_ids, description='', full_build=True) vectors = {} diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..60796eb99 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -90,7 +90,7 @@ class CATHashingTestsWithChange(CATHashingTestsWithFakeHash): def test_when_changes_do_calc(self): with unittest.mock.patch.object(CDB, 'calculate_hash', return_value='abcd1234') as patch_method: - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) hash = self.undertest.get_hash() self.assertIsInstance(hash, str) patch_method.assert_called() @@ -106,10 +106,10 @@ def test_default_cdb_not_dirty(self): self.assertFalse(self.undertest.cdb.is_dirty) def test_after_add_concept_is_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.assertTrue(self.undertest.cdb.is_dirty) def test_after_recalc_not_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.undertest.get_hash() self.assertFalse(self.undertest.cdb.is_dirty) From 26b5120384905eaba66db6eb2e3fd626d7c66c7b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Mon, 6 Nov 2023 17:48:15 +0000 Subject: [PATCH 22/64] Re-added deprecated method with deprecated flag and addtional comments --- medcat/cdb.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 91bb7bd9d..5c1c27a96 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -12,6 +12,7 @@ from medcat.utils.hasher import Hasher from medcat.utils.matutils import unitvec from medcat.utils.ml_utils import get_lr_linking +from medcat.utils.decorators import deprecated from medcat.config import Config, weighted_average, workers from medcat.utils.saving.serializer import CDBSerializer @@ -215,6 +216,44 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, 
type_ids=set(), description='', full_build=full_build) + @deprecated("Use `cdb._add_concept` as this will be removed in a future release.") + def add_concept(self, + cui: str, + names: Dict, + ontologies: set, + name_status: str, + type_ids: Set[str], + description: str, + full_build: bool = False) -> None: + """ + Deprecated: Use `cdb._add_concept` as this will be removed in a future release. + + Add a concept to internal Concept Database (CDB). Depending on what you are providing + this will add a large number of properties for each concept. + + Args: + cui (str): + Concept ID or unique identifier in this database, all concepts that have + the same CUI will be merged internally. + names (Dict[str, Dict]): + Names for this concept, or the value that if found in free text can be linked to this concept. + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' + ontologies (Set[str]): + ontologies in which the concept exists (e.g. SNOMEDCT, HPO) + name_status (str): + One of `P`, `N`, `A` + type_ids (Set[str]): + Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT) + description (str): + Description of this concept. + full_build (bool): + If True the dictionary self.addl_info will also be populated, contains a lot of extra information + about concepts, but can be very memory consuming. This is not necessary + for normal functioning of MedCAT (Default Value `False`). + """ + self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build) + def _add_concept(self, cui: str, names: Dict, From 81ba0bfcd5f99376e7d3ecda53b2c25cbc078e50 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 22 Nov 2023 17:44:07 +0000 Subject: [PATCH 23/64] Initial commit for merge_cdb method --- medcat/cdb.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..86d0ff810 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -802,3 +802,97 @@ def calculate_hash(self): self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash + +@staticmethod +def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + """Merge two CDB's together to produce a single CDB. + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite (bool): + NYI: Do not merge certain dictionaries, and prioritise a cdb. + vector_import (Dict[str, Dict[str, np.array]]): + NYI: Vectors to import, using the same format as cui2context_vectors. 
+ """ + # TODO: overwriting, vector import + config = cdb1.config.copy() + cdb = CDB(config) + + # names - copy cdb 1 as that is priority, and save computation time + cdb.name2cuis = cdb1.name2cuis.copy() + cdb.name2cuis2status = cdb1.name2cuis2status.copy() + cdb.name2count_train = cdb1.name2count_train.copy() + cdb.name_isupper = cdb1.name_isupper.copy() + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: #if they exist in both cdbs + cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name + if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: # if name only exists in cdb 2 + cdb.name2cuis[name] = cdb2.name2cuis[name] + if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # cui merging + cdb.cui2names = cdb1.cui2names.copy() + cdb.cui2snames = cdb1.cui2snames.copy() + cdb.cui2count_train = cdb1.cui2count_train.copy() + cdb.cui2info = cdb1.cui2info.copy() + cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() + cdb.cui2tags = cdb1.cui2tags.copy() + cdb.cui2type_ids = cdb1.cui2type_ids.copy() + cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() + + cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + for cui in cdb2.cui2names: + if cui in cdb1.cui2names: + cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) + if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + # this is where cui2info would be + if cui in cdb1.cui2context_vectors: + contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) + weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + for s in contexts: + if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + elif s in cdb1.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] + else: + cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + # Nothing to do with prefered name, unless overwrite + else: + cdb.cui2names[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: cdb.cui2tags[cui] = 
cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + cdb.addl_info = cdb1.addl_info.copy() + for key in cdb2.addl_info: + if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param + cdb.addl_info[key] = cdb2.addl_info[key] + + # vocab, adding counts if they occur in both + cdb.vocab = cdb1.vocab.copy() + for word in cdb2.vocab: + if word in cdb.vocab: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] + + return cdb \ No newline at end of file From 379a0dbbebc8212cf5c6f5ed46076bd79c15cc5c Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 22 Nov 2023 23:47:28 +0000 Subject: [PATCH 24/64] Added indentation to make merge_cdb a class method --- medcat/cdb.py | 184 +++++++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 86d0ff810..ec6f80319 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -803,96 +803,96 @@ def calculate_hash(self): logger.info("Found new CDB hash: %s", self._hash) return self._hash -@staticmethod -def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): - """Merge two CDB's together to produce a single CDB. - - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite (bool): - NYI: Do not merge certain dictionaries, and prioritise a cdb. - vector_import (Dict[str, Dict[str, np.array]]): - NYI: Vectors to import, using the same format as cui2context_vectors. 
- """ - # TODO: overwriting, vector import - config = cdb1.config.copy() - cdb = CDB(config) - - # names - copy cdb 1 as that is priority, and save computation time - cdb.name2cuis = cdb1.name2cuis.copy() - cdb.name2cuis2status = cdb1.name2cuis2status.copy() - cdb.name2count_train = cdb1.name2count_train.copy() - cdb.name_isupper = cdb1.name_isupper.copy() - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: #if they exist in both cdbs - cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: # if name only exists in cdb 2 - cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) - - # cui merging - cdb.cui2names = cdb1.cui2names.copy() - cdb.cui2snames = cdb1.cui2snames.copy() - cdb.cui2count_train = cdb1.cui2count_train.copy() - cdb.cui2info = cdb1.cui2info.copy() - cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() - cdb.cui2tags = cdb1.cui2tags.copy() - cdb.cui2type_ids = cdb1.cui2type_ids.copy() - cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() - - cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() - for cui in cdb2.cui2names: - if cui in cdb1.cui2names: - cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] - # this is where cui2info would be - if cui in cdb1.cui2context_vectors: - contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] - for s in contexts: - if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] - elif s in cdb1.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] - else: - cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] - if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - # Nothing to do with prefered name, unless overwrite - else: - cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] - if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done - if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: cdb.cui2tags[cui] = 
cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] - - cdb.addl_info = cdb1.addl_info.copy() - for key in cdb2.addl_info: - if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param - cdb.addl_info[key] = cdb2.addl_info[key] - - # vocab, adding counts if they occur in both - cdb.vocab = cdb1.vocab.copy() - for word in cdb2.vocab: - if word in cdb.vocab: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + @staticmethod + def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + """Merge two CDB's together to produce a single CDB. + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite (bool): + NYI: Do not merge certain dictionaries, and prioritise a cdb. + vector_import (Dict[str, Dict[str, np.array]]): + NYI: Vectors to import, using the same format as cui2context_vectors. + """ + # TODO: overwriting, vector import + config = cdb1.config.copy() + cdb = CDB(config) + + # names - copy cdb 1 as that is priority, and save computation time + cdb.name2cuis = cdb1.name2cuis.copy() + cdb.name2cuis2status = cdb1.name2cuis2status.copy() + cdb.name2count_train = cdb1.name2count_train.copy() + cdb.name_isupper = cdb1.name_isupper.copy() + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: #if they exist in both cdbs + cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name + if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: # if name only exists in cdb 2 + cdb.name2cuis[name] = cdb2.name2cuis[name] + if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # cui merging + cdb.cui2names = cdb1.cui2names.copy() + cdb.cui2snames = cdb1.cui2snames.copy() + cdb.cui2count_train = cdb1.cui2count_train.copy() + cdb.cui2info = cdb1.cui2info.copy() + cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() + cdb.cui2tags = cdb1.cui2tags.copy() + cdb.cui2type_ids = cdb1.cui2type_ids.copy() + cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() + + cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + for cui in cdb2.cui2names: + if cui in cdb1.cui2names: + cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) + if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + # this is where cui2info would be + if cui in cdb1.cui2context_vectors: + contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + norm = np.sum([cdb1.cui2count_train[cui], 
cdb2.cui2count_train[cui]]) + weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + for s in contexts: + if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + elif s in cdb1.cui2context_vectors[cui]: + cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] + else: + cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + # Nothing to do with prefered name, unless overwrite + else: + cdb.cui2names[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + cdb.addl_info = cdb1.addl_info.copy() + for key in cdb2.addl_info: + if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param + cdb.addl_info[key] = cdb2.addl_info[key] + + # vocab, adding counts if they occur in both + cdb.vocab = cdb1.vocab.copy() + for word in cdb2.vocab: + if word in cdb.vocab: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] - return cdb \ No newline at end of file + return cdb \ No newline at end of file From e64b2e0714ec558d2199f5cd9fe177c22561788b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:18:30 +0000 Subject: [PATCH 25/64] fixed syntax issues --- medcat/cdb.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index ec6f80319..6ddcfa5ab 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -804,7 +804,7 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict = {}): + def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict = {}): """Merge two CDB's together to produce a single CDB. 
Args: @@ -828,7 +828,7 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict cdb.name2count_train = cdb1.name2count_train.copy() cdb.name_isupper = cdb1.name_isupper.copy() for name in cdb2.name2cuis: - if name in cdb1.name2cuis: #if they exist in both cdbs + if name in cdb1.name2cuis: # if they exist in both cdbs cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason @@ -886,7 +886,7 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict for key in cdb2.addl_info: if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param cdb.addl_info[key] = cdb2.addl_info[key] - + # vocab, adding counts if they occur in both cdb.vocab = cdb1.vocab.copy() for word in cdb2.vocab: @@ -895,4 +895,5 @@ def merge_cdb(cdb1 : CDB, cdb2 : CDB, overwrite : int = 0, vector_import : Dict else: cdb.vocab[word] = cdb2.vocab[word] - return cdb \ No newline at end of file + return cdb + \ No newline at end of file From eefb010f115f25da2d3b4259ba1383ce34906f06 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:25:18 +0000 Subject: [PATCH 26/64] more lint fixes --- medcat/cdb.py | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 6ddcfa5ab..19fe04d55 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -830,13 +830,18 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict for name in cdb2.name2cuis: if name in cdb1.name2cuis: # if they exist in both cdbs cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + if name in cdb1.name2cuis2status: + cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} + if name in cdb1.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason else: # if name only exists in cdb 2 cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: cdb.name_isupper[name] = cdb2.name_isupper[name] + if name in cdb2.name2cuis2status: + cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name_isupper: + cdb.name_isupper[name] = cdb2.name_isupper[name] # snames cdb.snames = cdb1.snames.union(cdb2.snames) @@ -855,8 +860,10 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict for cui in cdb2.cui2names: if cui in cdb1.cui2names: cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if 
cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + if cui in cdb1.cui2snames: + cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) + if cui in cdb1.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] # this is where cui2info would be if cui in cdb1.cui2context_vectors: contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short @@ -864,7 +871,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] + cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] elif s in cdb1.cui2context_vectors[cui]: cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] else: @@ -874,13 +881,20 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict # Nothing to do with prefered name, unless overwrite else: cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: cdb.cui2snames[cui] = cdb2.cui2snames[cui] - if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done - if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + if cui in cdb2.cui2snames: + cdb.cui2snames[cui] = cdb2.cui2snames[cui] + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + if cui in cdb2.cui2preferred_name: + cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] cdb.addl_info = cdb1.addl_info.copy() for key in cdb2.addl_info: @@ -896,4 +910,3 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict cdb.vocab[word] = cdb2.vocab[word] return cdb - \ No newline at end of file From ff48a2a8168c5216afe0ddb14ec6eae13ae6df78 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 00:29:31 +0000 Subject: [PATCH 27/64] more lint fixes --- medcat/cdb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 19fe04d55..6580569f1 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -876,8 +876,10 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] else: cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] - 
if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) # Nothing to do with prefered name, unless overwrite else: cdb.cui2names[cui] = cdb2.cui2names[cui] From f299677c12afed91a4f50ecb8848a708491c6429 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 19:15:33 +0000 Subject: [PATCH 28/64] bug fixes of merge_cdb --- medcat/cdb.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 6580569f1..368d7bc0e 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -804,7 +804,7 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict = {}): + def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[str, dict[str, np.array]] = {}): """Merge two CDB's together to produce a single CDB. Args: @@ -863,12 +863,14 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: Dict if cui in cdb1.cui2snames: cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) if cui in cdb1.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] + cdb1.cui2count_train[cui] + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + cdb1.cui2count_train[cui] # this is where cui2info would be if cui in cdb1.cui2context_vectors: - contexts = set(cdb1.cui2context_vectors[cui].keys() + cdb2.cui2context_vectors[cui].keys()) # xlong, long, medium, short + contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [cdb1.cui2count_train[cui]/norm, cdb2.cui2count_train[cui]/norm] + print(cdb1.cui2count_train[cui]) + print(norm) + weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] From abb68b5994e73a219e999cb4544003ff5cd358d9 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 23 Nov 2023 22:35:09 +0000 Subject: [PATCH 29/64] removed print statements --- medcat/cdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 368d7bc0e..47a9d8e02 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -867,9 +867,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ # this is where cui2info would be if cui in cdb1.cui2context_vectors: contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - print(cdb1.cui2count_train[cui]) - print(norm) + norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] for s in contexts: if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: From b6b023b9cb5c9c40a5d9824d8a1f368b32aa6b21 Mon Sep 17 00:00:00 2001 From: Mart 
Ratas Date: Mon, 27 Nov 2023 03:21:10 -0600 Subject: [PATCH 30/64] CU-86931prq4: Update GHA versions (checkout and setup-python) to v4 (#368) --- .github/workflows/main.yml | 8 ++++---- .github/workflows/production.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c769dfc2e..a5468fb9b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,9 +16,9 @@ jobs: max-parallel: 4 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -48,13 +48,13 @@ jobs: steps: - name: Checkout master - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: 'master' fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 5088c1000..9ad9a5d90 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -14,13 +14,13 @@ jobs: steps: - name: Checkout production - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.release.target_commitish }} fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 From 6a5103cbd9b9187a53040e82fa42f2f628faa7bc Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 27 Nov 2023 05:14:44 -0600 Subject: [PATCH 31/64] Cu 1yn0v9e duplicate multiprocessing methods (#364) * CU-1yn0v9e: Rename and deprecate one of the multiprocessing methods; Add docstring. Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Rename and deprecate the multiprocessing_pipe method; Add docstring. 
Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Fix typo in docstring; more consistent naming --- medcat/cat.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-- tests/test_cat.py | 8 ++++---- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index a86584c3a..9cca44b0f 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1335,6 +1335,7 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_ pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb')) return part_counter + @deprecated(message="Use `multiprocessing_batch_char_size` instead") def multiprocessing(self, data: Union[List[Tuple], Iterable[Tuple]], nproc: int = 2, @@ -1345,9 +1346,31 @@ def multiprocessing(self, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), min_free_memory=0.1) -> Dict: + return self.multiprocessing_batch_char_size(data=data, nproc=nproc, + batch_size_chars=batch_size_chars, + only_cui=only_cui, addl_info=addl_info, + separate_nn_components=separate_nn_components, + out_split_size_chars=out_split_size_chars, + save_dir_path=save_dir_path, + min_free_memory=min_free_memory) + + def multiprocessing_batch_char_size(self, + data: Union[List[Tuple], Iterable[Tuple]], + nproc: int = 2, + batch_size_chars: int = 5000 * 1000, + only_cui: bool = False, + addl_info: List[str] = [], + separate_nn_components: bool = True, + out_split_size_chars: Optional[int] = None, + save_dir_path: str = os.path.abspath(os.getcwd()), + min_free_memory=0.1) -> Dict: r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating documents if something is saved in that directory. + This method batches the data based on the number of characters as specified by user. + + PS: This method is unlikely to work on a Windows machine. + Args: data: Iterator or array with format: [(id, text), (id, text), ...] @@ -1531,7 +1554,22 @@ def _multiprocessing_batch(self, return docs - def multiprocessing_pipe(self, + @deprecated(message="Use `multiprocessing_batch_docs_size` instead") + def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]], + nproc: Optional[int] = None, + batch_size: Optional[int] = None, + only_cui: bool = False, + addl_info: List[str] = [], + return_dict: bool = True, + batch_factor: int = 2) -> Union[List[Tuple], Dict]: + return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc, + batch_size=batch_size, + only_cui=only_cui, + addl_info=addl_info, + return_dict=return_dict, + batch_factor=batch_factor) + + def multiprocessing_batch_docs_size(self, in_data: Union[List[Tuple], Iterable[Tuple]], nproc: Optional[int] = None, batch_size: Optional[int] = None, @@ -1539,7 +1577,12 @@ def multiprocessing_pipe(self, addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: - """Run multiprocessing NOT FOR TRAINING + """Run multiprocessing NOT FOR TRAINING. + + This method batches the data based on the number of documents as specified by the user. + + PS: + This method supports Windows. Args: in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...] 
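A short usage sketch of the two renamed methods (the model pack path and texts are placeholders); the deprecated `multiprocessing` and `multiprocessing_pipe` wrappers now simply forward to them.

```python
from medcat.cat import CAT

cat = CAT.load_model_pack("medcat_model_pack.zip")  # placeholder
docs = [(1, "Patient admitted with kidney failure."), (2, "No acute findings.")]

# batches by number of documents; also works on Windows
by_docs = cat.multiprocessing_batch_docs_size(docs, nproc=2, return_dict=True)

# batches by number of characters; unlikely to work on Windows
by_chars = cat.multiprocessing_batch_char_size(docs, nproc=2, batch_size_chars=100000)
```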
diff --git a/tests/test_cat.py b/tests/test_cat.py index 62db4d44d..cd33efbc7 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -60,7 +60,7 @@ def test_multiprocessing(self): (2, ""), (3, None) ] - out = self.undertest.multiprocessing(in_data, nproc=1) + out = self.undertest.multiprocessing_batch_char_size(in_data, nproc=1) self.assertEqual(3, len(out)) self.assertEqual(1, len(out[1]['entities'])) @@ -73,7 +73,7 @@ def test_multiprocessing_pipe(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house."), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -89,7 +89,7 @@ def test_multiprocessing_pipe_with_malformed_texts(self): (2, ""), (3, None), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=1, batch_size=1, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=1, batch_size=1, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -105,7 +105,7 @@ def test_multiprocessing_pipe_return_dict(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house.") ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=True) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=True) self.assertTrue(type(out) == dict) self.assertEqual(3, len(out)) self.assertEqual({'entities': {}, 'tokens': []}, out[1]) From 72f7ddad20d6e8ae2aa7d8f761c0eb0fe5604290 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 29 Nov 2023 03:44:16 -0600 Subject: [PATCH 32/64] 869377m3u: Add comment regarding demo link load times to README (#376) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 395aecf69..c47cf9a65 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. +PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. ## Tutorials A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a). 
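A rough usage sketch of the `CDB.merge_cdb` static method introduced in the preceding patches (paths are placeholders); the test added in the next patch exercises the same flow with CDBs built by `CDBMaker`.

```python
from medcat.cdb import CDB

cdb1 = CDB.load("cdb1.dat")  # placeholder paths
cdb2 = CDB.load("cdb2.dat")

# names, snames, CUIs, training counts and context vectors are combined;
# where a straight merge is not possible (e.g. cui2preferred_name), cdb1 wins
merged = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2)
merged.save("merged_cdb.dat")
```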
From 900439aca48bdfb018fa082b6508f418895092ab Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 29 Nov 2023 12:59:03 +0000 Subject: [PATCH 33/64] intermediate changes of merge_cdb and testing function --- medcat/cdb.py | 1 + tests/test_cdb.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 47a9d8e02..886809604 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -823,6 +823,7 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ cdb = CDB(config) # names - copy cdb 1 as that is priority, and save computation time + # TODO: CHECK BENEFITS OF USING ADD_NAMES HERE cdb.name2cuis = cdb1.name2cuis.copy() cdb.name2cuis2status = cdb1.name2cuis2status.copy() cdb.name2count_train = cdb1.name2count_train.copy() diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 96425bc8c..8ec055a37 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -6,6 +6,7 @@ import numpy as np from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.cdb import CDB class CDBTests(unittest.TestCase): @@ -82,5 +83,31 @@ def test_cui2snames_population(self): with self.subTest(cui): self.assertIn(cui, self.undertest.cui2snames) + + def test_merge_cdb(self): + # generating CDBs + config = Config() + maker = CDBMaker(config) + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") + cdb1 = maker.prepare_csvs(csv_paths=[path]) + cdb2 = cdb1.copy() + + # generating vectors and setting up + zeroes = np.zeros(shape=(1,300)) + ones = np.ones(shape=(1,300)) + for i, cui in enumerate(cdb1.cui2names): + cdb1.cui2context_vectors[cui] = {"short" : zeroes} + cdb2.cui2context_vectors[cui] = {"short" : ones} + cdb1.cui2count_train[cui] = 1 + cdb2.cui2count_train[cui] = i + test_add = {"test": {'tokens': "test_token", 'snames': "test_sname", 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb1.add_names("C0006826", test_add) + + # merging + cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + # tests + + + if __name__ == '__main__': unittest.main() From d8473d9fabdc49eee3976f2d362433e28f36414b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 <60137864+adam-sutton-1992@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:46:28 +0000 Subject: [PATCH 34/64] Added README.md documentation for CPU only installations (#365) * changed README.md to reflect installation options. * added setup script to demonstrate how wrapper could look for CPU installations * removed setup.sh as unnessescary for cpu only builds * Initial commit for merge_cdb method * Added indentation to make merge_cdb a class method * fixed syntax issues * more lint fixes * more lint fixes * bug fixes of merge_cdb * removed print statements * Added commentary on disk space usage of pytorch-gpu * removed merge_cdb from branch --------- Co-authored-by: adam-sutton-1992 --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index c47cf9a65..bf34f00c6 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,17 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g - **Paper**: [What’s in a Summary? 
Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf) - ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md)) +## Installation +To install the latest version of MedCAT, run the following command: +``` +pip install medcat +``` +Normal installations of MedCAT will install the GPU version of torch and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU-only usage. + +To install the latest version of MedCAT without torch GPU support, run the following command: +``` +pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/ +``` ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. From 76b75cc4e3558e9a48d1fe8aa43ba23621652a75 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 30 Nov 2023 05:47:37 -0600 Subject: [PATCH 35/64] Cu 8692zguyq no preferred name (#367) * CU-8692zguyq: Slight simplification of minimum-name-length logic * CU-8692zguyq: Add some tests for prepare_name preprocessor * CU-8692zguyq: Add warning if no preferred name was added along a new CUI * CU-8692zguyq: Add additional warning messages when adding/training a new CUI with no preferred name * CU-8692zguyq: Make no preferred name warnings only run if name status is preferred * CU-8692zguyq: Add tests for no-preferred name warnings * CU-8692zguyq: Add Vocab.make_unigram_table to CAT tests * CU-8692zguyq: Move to built in asserting for logging instead of patching the method * CU-8692zguyq: Add workaround for assertNoLogs on python 3.8 and 3.9 --- medcat/cat.py | 4 ++ medcat/cdb.py | 15 ++++ medcat/preprocessing/cleaners.py | 3 +- tests/preprocessing/__init__.py | 0 tests/preprocessing/test_cleaners.py | 104 +++++++++++++++++++++++++++ tests/test_cat.py | 55 +++++++++++++- 6 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_cleaners.py diff --git a/medcat/cat.py b/medcat/cat.py index ef6099566..f49a25022 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -840,6 +840,10 @@ def add_and_train_concept(self, Refer to medcat.cat.cdb.CDB.add_concept """ names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) + if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P': + logger.warning("No names were able to be prepared in CAT.add_and_train_concept " + "method. As such no preferred name will be able to be specified. 
" + "The CUI: '%s' and raw name: '%s'", cui, name) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, diff --git a/medcat/cdb.py b/medcat/cdb.py index 1110a3b84..2ca8382a7 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -358,6 +358,21 @@ def _add_concept(self, if name_status == 'P' and cui not in self.cui2preferred_name: # Do not overwrite old preferred names self.cui2preferred_name[cui] = name_info['raw_name'] + elif names: + # if no name_info and names is NOT an empty dict + # this shouldn't really happen in the current setup + raise ValueError("Unknown state where there is no name_info, " + "yet the `names` dict is not empty (%s)", names) + elif name_status == 'P' and cui not in self.cui2preferred_name: + # this means names is an empty `names` dict + logger.warning("Did not manage to add a preferred name in `add_cui`. " + "Was trying to do so for cui: '%s'" + "This means that the `names` dict passed was empty. " + "This is _usually_ caused by either no name or too short " + "a name passed to the `prepare_name` method. " + "The minimum length is defined in: " + "'config.cdb_maker.min_letters_required' and " + "is currently set at %s", cui, self.config.cdb_maker['min_letters_required']) # Add other fields if full_build if full_build: diff --git a/medcat/preprocessing/cleaners.py b/medcat/preprocessing/cleaners.py index 18314d562..43e8098e2 100644 --- a/medcat/preprocessing/cleaners.py +++ b/medcat/preprocessing/cleaners.py @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D snames = set() name = config.general['separator'].join(tokens) - if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0): + min_letters = config.cdb_maker.get('min_letters_required', 0) + if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters: if name not in names: sname = "" for token in tokens: diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/preprocessing/test_cleaners.py b/tests/preprocessing/test_cleaners.py new file mode 100644 index 000000000..b879d9ee6 --- /dev/null +++ b/tests/preprocessing/test_cleaners.py @@ -0,0 +1,104 @@ +from medcat.preprocessing.cleaners import prepare_name +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + +import logging, os + +import unittest + + +class BaseCDBMakerTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general['log_level'] = logging.DEBUG + config.general["spacy_model"] = "en_core_web_md" + cls.maker = CDBMaker(config) + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv') + ] + cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) + + +class BasePrepareNameTest(BaseCDBMakerTests): + raw_name = 'raw' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.do_prepare_name() + + # method called after setup, when raw_name has been specified + @classmethod + def do_prepare_name(cls) -> None: + cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split()) + 
cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def _dict_has_key_val_type(self, d: dict, key, val_type): + self.assertIn(key, d) + self.assertIsInstance(d[key], val_type) + + def _names_has_key_val_type(self, key, val_type): + self._dict_has_key_val_type(self.names, key, val_type) + + def test_result_has_name(self): + self._names_has_key_val_type(self.name, dict) + + def test_name_info_has_tokens(self): + self._dict_has_key_val_type(self.names[self.name], 'tokens', list) + + def test_name_info_has_words_as_tokens(self): + name_info = self.names[self.name] + tokens = name_info['tokens'] + for word in self.raw_name.split(): + with self.subTest(word): + self.assertIn(word, tokens) + + +class NamePreparationTests_OneLetter(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "a" + # the minimum name length is defined by the following config option + # if I don't set this to 1 here, I would see the tests fail + # that would be because the result from `prepare_names` would be empty + cls.cdb.config.cdb_maker.min_letters_required = 1 + super().do_prepare_name() + + +class NamePreparationTests_TwoLetters(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "an" + super().do_prepare_name() + + +class NamePreparationTests_MultiToken(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "this raw name" + super().do_prepare_name() + + +class NamePreparationTests_Empty(BaseCDBMakerTests): + """In case of an empty name, I would expect the names dict + returned by `prepare_name` to be empty. + """ + empty_raw_name = '' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def test_names_dict_is_empty(self): + self.assertEqual(len(self.names), 0) + self.assertEqual(self.names, {}) diff --git a/tests/test_cat.py b/tests/test_cat.py index cd33efbc7..acd337e71 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -4,10 +4,12 @@ import unittest import tempfile import shutil +import logging +import contextlib from transformers import AutoTokenizer from medcat.vocab import Vocab -from medcat.cdb import CDB -from medcat.cat import CAT +from medcat.cdb import CDB, logger as cdb_logger +from medcat.cat import CAT, logger as cat_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -20,6 +22,7 @@ class CATTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + cls.vocab.make_unigram_table() cls.cdb.config.general.spacy_model = "en_core_web_md" cls.cdb.config.ner.min_name_len = 2 cls.cdb.config.ner.upper_case_limit_len = 3 @@ -388,6 +391,54 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def _assertNoLogs(self, logger: logging.Logger, level: int): + if hasattr(self, 'assertNoLogs'): + return self.assertNoLogs(logger=logger, level=level) + else: + return self.__assertNoLogs(logger=logger, level=level) + + @contextlib.contextmanager + def __assertNoLogs(self, 
logger: logging.Logger, level: int): + try: + with self.assertLogs(logger, level) as captured_logs: + yield + except AssertionError: + return + if captured_logs: + raise AssertionError("Logs were found: {}".format(captured_logs)) + + def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level, + name: str, name_status: str, nr_of_calls: int): + cui = 'CUI-%d'%(hash(name) + id(name)) + with (self.assertLogs(logger=logger, level=log_level) + if nr_of_calls == 1 + else self._assertNoLogs(logger=logger, level=log_level)): + self.undertest.add_and_train_concept(cui, name, name_status=name_status) + + def test_add_and_train_concept_cat_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + + def test_add_and_train_concept_cdb_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + class ModelWithTwoConfigsLoadTests(unittest.TestCase): From 7fddac0626382720c1194b465b1e6f975a00152f Mon Sep 17 00:00:00 2001 From: Xi Bai <82581439+baixiac@users.noreply.github.com> Date: Tue, 5 Dec 2023 17:36:28 +0000 Subject: [PATCH 36/64] Add trainer callbacks for Transformer NER (#377) CU-86938vf30 add trainer callbacks for Transformer NER --- medcat/ner/transformers_ner.py | 16 ++++++++-- tests/ner/test_transformers_ner.py | 50 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/ner/test_transformers_ner.py diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py index 9623b1b93..227ccc083 100644 --- a/medcat/ner/transformers_ner.py +++ b/medcat/ner/transformers_ner.py @@ -1,6 +1,7 @@ import os import json import logging +import datasets from spacy.tokens import Doc from datetime import datetime from typing import Iterable, Iterator, Optional, Dict, List, cast, Union @@ -18,7 +19,7 @@ from transformers import Trainer, AutoModelForTokenClassification, AutoTokenizer from transformers import pipeline, TrainingArguments -import datasets +from transformers.trainer_callback import TrainerCallback # It should be safe to do this always, as all other multiprocessing #will be finished before data comes to meta_cat @@ -137,7 +138,12 @@ def merge_data_loaded(base, other): return out_path - def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=False, dataset=None, meta_requirements=None): + def train(self, + json_path: Union[str, list, None]=None, + ignore_extra_labels=False, + dataset=None, + meta_requirements=None, + trainer_callbacks: 
Optional[List[TrainerCallback]]=None): """Train or continue training a model give a json_path containing a MedCATtrainer export. It will continue training if an existing model is loaded or start new training if the model is blank/new. @@ -149,6 +155,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals ignore_extra_labels: Makes only sense when an existing deid model was loaded and from the new data we want to ignore labels that did not exist in the old model. + trainer_callbacks (List[TrainerCallback]): + A list of trainer callbacks for collecting metrics during the training at the client side. The + transformers Trainer object will be passed in when each callback is called. """ if dataset is None and json_path is not None: @@ -193,6 +202,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals compute_metrics=lambda p: metrics(p, tokenizer=self.tokenizer, dataset=encoded_dataset['test'], verbose=self.config.general['verbose_metrics']), data_collator=data_collator, # type: ignore tokenizer=None) + if trainer_callbacks: + for callback in trainer_callbacks: + trainer.add_callback(callback(trainer)) trainer.train() # type: ignore diff --git a/tests/ner/test_transformers_ner.py b/tests/ner/test_transformers_ner.py new file mode 100644 index 000000000..de9eae32c --- /dev/null +++ b/tests/ner/test_transformers_ner.py @@ -0,0 +1,50 @@ +import os +import unittest +from spacy.lang.en import English +from spacy.tokens import Doc, Span +from transformers import TrainerCallback +from medcat.ner.transformers_ner import TransformersNER +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class TransformerNERTest(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_web_md" + cdb_maker = CDBMaker(config) + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "examples", "cdb.csv") + cdb = cdb_maker.prepare_csvs([cdb_csv], full_build=True) + Doc.set_extension("ents", default=[], force=True) + Span.set_extension("confidence", default=-1, force=True) + Span.set_extension("id", default=0, force=True) + Span.set_extension("detected_name", default=None, force=True) + Span.set_extension("link_candidates", default=None, force=True) + Span.set_extension("cui", default=-1, force=True) + Span.set_extension("context_similarity", default=-1, force=True) + cls.undertest = TransformersNER(cdb) + cls.undertest.create_eval_pipeline() + + def test_pipe(self): + doc = English().make_doc("\nPatient Name: John Smith\nAddress: 15 Maple Avenue\nCity: New York\nCC: Chronic back pain\n\nHX: Mr. 
Smith") + doc = next(self.undertest.pipe([doc])) + assert len(doc.ents) > 0, "No entities were recognised" + + def test_train(self): + tracker = unittest.mock.Mock() + class _DummyCallback(TrainerCallback): + def __init__(self, trainer) -> None: + self._trainer = trainer + def on_epoch_end(self, *args, **kwargs) -> None: + tracker.call() + + train_data = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "resources", "deid_train_data.json") + self.undertest.training_arguments.num_train_epochs = 1 + df, examples, dataset = self.undertest.train(train_data, trainer_callbacks=[_DummyCallback, _DummyCallback]) + assert "fp" in examples + assert "fn" in examples + assert dataset["train"].num_rows == 48 + assert dataset["test"].num_rows == 12 + self.assertEqual(tracker.call.call_count, 2) From 6a820f03e627fee1f02a59a858a340de35bf41f3 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Mon, 11 Dec 2023 20:13:30 +0000 Subject: [PATCH 37/64] changes to merge_cdb and adding unit tests for method --- medcat/cdb.py | 143 +++++++++++++++++++++++----------------------- tests/test_cdb.py | 29 +++++++--- 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 886809604..075a8d611 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -7,6 +7,7 @@ import numpy as np from typing import Dict, Set, Optional, List, Union from functools import partial +from copy import deepcopy from medcat import __version__ from medcat.utils.hasher import Hasher @@ -804,8 +805,11 @@ def calculate_hash(self): return self._hash @staticmethod - def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[str, dict[str, np.array]] = {}): - """Merge two CDB's together to produce a single CDB. + def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. Args: cdb1 (medcat.cdb.CDB): @@ -813,103 +817,98 @@ def merge_cdb(cdb1: "CDB", cdb2: "CDB", overwrite: int = 0, vector_import: dict[ cui2preferred_name), this cdb values will be prioritised over cdb2. cdb2 (medcat.cdb.CDB): The second medcat cdb to merge. - overwrite (bool): - NYI: Do not merge certain dictionaries, and prioritise a cdb. - vector_import (Dict[str, Dict[str, np.array]]): - NYI: Vectors to import, using the same format as cui2context_vectors. + overwrite_training (int): + Choose to prioritise a CDB's context vectors values over merging gracefully. 
0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" """ - # TODO: overwriting, vector import - config = cdb1.config.copy() + config = deepcopy(cdb1.config) cdb = CDB(config) - # names - copy cdb 1 as that is priority, and save computation time - # TODO: CHECK BENEFITS OF USING ADD_NAMES HERE - cdb.name2cuis = cdb1.name2cuis.copy() - cdb.name2cuis2status = cdb1.name2cuis2status.copy() - cdb.name2count_train = cdb1.name2count_train.copy() - cdb.name_isupper = cdb1.name_isupper.copy() - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: # if they exist in both cdbs - cdb.name2cuis[name] = list(set(cdb1.name2cuis[name] + cdb2.name2cuis[name])) # unique CUI's only for each name - if name in cdb1.name2cuis2status: - cdb.name2cuis2status[name] = {**cdb2.name2cuis2status[name], **cdb1.name2cuis2status[name]} - if name in cdb1.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: # if name only exists in cdb 2 - cdb.name2cuis[name] = cdb2.name2cuis[name] - if name in cdb2.name2cuis2status: - cdb.name2cuis2status[name] = cdb2.name2cuis2status[name] - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] - if name in cdb2.name_isupper: - cdb.name_isupper[name] = cdb2.name_isupper[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) - # cui merging - cdb.cui2names = cdb1.cui2names.copy() - cdb.cui2snames = cdb1.cui2snames.copy() - cdb.cui2count_train = cdb1.cui2count_train.copy() - cdb.cui2info = cdb1.cui2info.copy() - cdb.cui2context_vectors = cdb1.cui2context_vectors.copy() - cdb.cui2tags = cdb1.cui2tags.copy() - cdb.cui2type_ids = cdb1.cui2type_ids.copy() - cdb.cui2preferred_name = cdb1.cui2preferred_name.copy() - - cdb.cui2average_confidence = cdb1.cui2average_confidence.copy() + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames' : cdb2.cui2snames.get(cui, set()), 'is_upper' : cdb2.name_isupper.get(name, False), 'tokens' : {}} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + ontologies = set() + description = '' + # For addl_info check cui2original_names as they MUST be added + if full_build and cui in cdb2.addl_info['cui2original_names']: + if 'cui2ontologies' in cdb2.addl_info: + ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, 
name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=full_build) if cui in cdb1.cui2names: - cdb.cui2names[cui] = cdb1.cui2names[cui].union(cdb2.cui2names[cui]) - if cui in cdb1.cui2snames: - cdb.cui2snames[cui] = cdb1.cui2snames[cui].union(cdb2.cui2snames[cui]) - if cui in cdb1.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + cdb1.cui2count_train[cui] - # this is where cui2info would be + if cui in cdb1.cui2count_train or cui in cdb2.cui2count_train: + if overwrite_training == 1 and cui in cdb1.cui2count_train[cui]: + cdb.cui2count_train[cui] = cdb1.cui2count_train[cui] + elif overwrite_training == 2 and cui in cdb2.cui2count_train[cui]: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) if cui in cdb1.cui2context_vectors: - contexts = set(list(cdb1.cui2context_vectors[cui]) + list(cdb2.cui2context_vectors[cui].keys())) # xlong, long, medium, short - norm = np.sum([cdb1.cui2count_train[cui], cdb2.cui2count_train[cui]]) - weights = [np.divide(cdb1.cui2count_train[cui], norm), np.divide(cdb2.cui2count_train[cui], norm)] - for s in contexts: - if s in cdb1.cui2context_vectors[cui] and s in cdb2.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = weights[0] * cdb1.cui2context_vectors[cui][s] + weights[1] * cdb2.cui2context_vectors[cui][s] - elif s in cdb1.cui2context_vectors[cui]: - cdb.cui2context_vectors[cui][s] = cdb1.cui2context_vectors[cui][s] - else: - cdb.cui2context_vectors[cui][s] = cdb2.cui2context_vectors[cui][s] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + if overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]: + weights = [1, 0] + elif overwrite_training == 2 and cui in cdb2.cui2context_vectors[cui]: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) if cui in cdb1.cui2tags: cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) if cui in cdb1.cui2type_ids: cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - # Nothing to do with prefered name, unless overwrite else: - cdb.cui2names[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2snames: - cdb.cui2snames[cui] = cdb2.cui2snames[cui] if cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2names[cui] if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] # no idea what this is used for, so no merging will be done + cdb.cui2info[cui] = cdb2.cui2info[cui] if cui in cdb2.cui2context_vectors: cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] if cui in cdb2.cui2tags: cdb.cui2tags[cui] = cdb2.cui2tags[cui] if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - if cui in cdb2.cui2preferred_name: - cdb.cui2preferred_name[cui] = cdb2.cui2preferred_name[cui] + + for name in cdb2.name2cuis: + if name in cdb1.name2cuis: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for 
some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] - cdb.addl_info = cdb1.addl_info.copy() - for key in cdb2.addl_info: - if key not in cdb1.addl_info: # doesn't / can't handle clashes TODO: Handle Overwrite Param - cdb.addl_info[key] = cdb2.addl_info[key] + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) # vocab, adding counts if they occur in both - cdb.vocab = cdb1.vocab.copy() + cdb.vocab = deepcopy(cdb1.vocab) for word in cdb2.vocab: if word in cdb.vocab: cdb.vocab[word] += cdb2.vocab[word] else: cdb.vocab[word] = cdb2.vocab[word] - + return cdb diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 8ec055a37..7177ed903 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -85,27 +85,40 @@ def test_cui2snames_population(self): def test_merge_cdb(self): - # generating CDBs + # generating cdbs - two maker are requested as they point to the same created CDB. config = Config() - maker = CDBMaker(config) + config.general["spacy_model"] = "en_core_web_md" + maker1 = CDBMaker(config) + maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") - cdb1 = maker.prepare_csvs(csv_paths=[path]) - cdb2 = cdb1.copy() + cdb1 = maker1.prepare_csvs(csv_paths=[path]) + cdb2 = maker2.prepare_csvs(csv_paths=[path]) # generating vectors and setting up zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short" : zeroes} - cdb2.cui2context_vectors[cui] = {"short" : ones} + cdb1.cui2context_vectors[cui] = {"short" : ones} + cdb2.cui2context_vectors[cui] = {"short" : zeroes} cdb1.cui2count_train[cui] = 1 - cdb2.cui2count_train[cui] = i - test_add = {"test": {'tokens': "test_token", 'snames': "test_sname", 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb2.cui2count_train[cui] = i + 1 + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} cdb1.add_names("C0006826", test_add) + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + cdb2.add_names("UniqueTest", unique_test) + cdb2.cui2context_vectors["UniqueTest"] = {"short" : ones} # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + # tests + self.assertIn("test", cdb.cui2names["C0006826"]) + self.assertIn("test_name", cdb.cui2snames["C0006826"]) + self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) + self.assertTrue(np.array_equal(np.ones(shape=(1,300)), cdb.cui2context_vectors["UniqueTest"]["short"])) + base = np.ones(shape=(1,300)) + for i, cui in enumerate(cdb1.cui2names): + self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(base, i+2))) From f96758aaa857ffd034726fbc84b830793435c0aa Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Tue, 12 Dec 2023 14:06:31 +0000 Subject: [PATCH 38/64] fixing lint issues --- medcat/cdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index f7800aeb6..1870b8f7a 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -967,7 +967,7 @@ def merge_cdb(cdb1: "CDB", for cui in cdb2.cui2names: names = dict() for name in cdb2.cui2names[cui]: - names[name] = {'snames' : cdb2.cui2snames.get(cui, set()), 'is_upper' : cdb2.name_isupper.get(name, False), 'tokens' : {}} + names[name] = {'snames': 
cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}} name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' ontologies = set() description = '' @@ -1013,7 +1013,7 @@ def merge_cdb(cdb1: "CDB", cdb.cui2tags[cui] = cdb2.cui2tags[cui] if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - + for name in cdb2.name2cuis: if name in cdb1.name2cuis: # if they exist in both cdbs if name in cdb1.name2count_train and name in cdb2.name2count_train: From 1975b1c13b657966ff76c1c075e164e79cec5452 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Tue, 12 Dec 2023 14:24:16 +0000 Subject: [PATCH 39/64] fixing flake8 linting --- medcat/cdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 1870b8f7a..d773d1f4f 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -1032,5 +1032,5 @@ def merge_cdb(cdb1: "CDB", cdb.vocab[word] += cdb2.vocab[word] else: cdb.vocab[word] = cdb2.vocab[word] - + return cdb From 6f752c8d2d10768fe9e3a822eb26a1b5aa973aa7 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 13 Dec 2023 20:09:14 +0000 Subject: [PATCH 40/64] bug fixes, additional tests, and more documentation --- medcat/cdb.py | 51 ++++++++++++++++++++++++----------------------- tests/test_cdb.py | 31 ++++++++++++++++++---------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index d773d1f4f..1737b4bad 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -932,6 +932,7 @@ def merge_cdb(cdb1: "CDB", overwrite_training: int = 0, full_build: bool = False): """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. + `addl_info` can not be perfectly merged, and will prioritise cdb1. 
see `full_build` Args: cdb1 (medcat.cdb.CDB): @@ -967,35 +968,33 @@ def merge_cdb(cdb1: "CDB", for cui in cdb2.cui2names: names = dict() for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}} + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added ontologies = set() description = '' - # For addl_info check cui2original_names as they MUST be added - if full_build and cui in cdb2.addl_info['cui2original_names']: + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True if 'cui2ontologies' in cdb2.addl_info: ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) if 'cui2description' in cdb2.addl_info: description = cdb2.addl_info['cui2description'][cui] cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=full_build) + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) if cui in cdb1.cui2names: - if cui in cdb1.cui2count_train or cui in cdb2.cui2count_train: - if overwrite_training == 1 and cui in cdb1.cui2count_train[cui]: - cdb.cui2count_train[cui] = cdb1.cui2count_train[cui] - elif overwrite_training == 2 and cui in cdb2.cui2count_train[cui]: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] else: cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors: - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - if overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]: - weights = [1, 0] - elif overwrite_training == 2 and cui in cdb2.cui2context_vectors[cui]: + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and cui in cdb2.cui2context_vectors: weights = [0, 1] else: norm = cdb.cui2count_train[cui] weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short for s in contexts: cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) if cui in cdb1.cui2tags: @@ -1014,23 +1013,25 @@ def merge_cdb(cdb1: "CDB", if cui in cdb2.cui2type_ids: cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - for name in cdb2.name2cuis: - if name in cdb1.name2cuis: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in 
cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] + if overwrite_training != 1: + for name in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] # snames cdb.snames = cdb1.snames.union(cdb2.snames) # vocab, adding counts if they occur in both cdb.vocab = deepcopy(cdb1.vocab) - for word in cdb2.vocab: - if word in cdb.vocab: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] return cdb diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 29c603daa..3ff7e5dad 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -101,32 +101,43 @@ def test_merge_cdb(self): cdb1 = maker1.prepare_csvs(csv_paths=[path]) cdb2 = maker2.prepare_csvs(csv_paths=[path]) - # generating vectors and setting up + # generating context vectors here for for testing the weighted average function (based off cui2count_train) zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short" : ones} - cdb2.cui2context_vectors[cui] = {"short" : zeroes} + cdb1.cui2context_vectors[cui] = {"short": ones} + cdb2.cui2context_vectors[cui] = {"short": zeroes} cdb1.cui2count_train[cui] = 1 cdb2.cui2count_train[cui] = i + 1 - test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + # adding new names and cuis to each cdb to test after merging + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} cdb1.add_names("C0006826", test_add) - unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper" : "P"}} + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} cdb2.add_names("UniqueTest", unique_test) - cdb2.cui2context_vectors["UniqueTest"] = {"short" : ones} + cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} + cdb2.addl_info["cui2ontologies"] = {} + cdb2.addl_info["cui2description"] = {} + for cui in cdb2.cui2names: + cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + cdb2.addl_info["cui2description"][cui] = "test_description" # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) + overwrite_cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2, overwrite_training=2, full_build=True) # tests self.assertIn("test", cdb.cui2names["C0006826"]) self.assertIn("test_name", cdb.cui2snames["C0006826"]) self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) - self.assertTrue(np.array_equal(np.ones(shape=(1,300)), cdb.cui2context_vectors["UniqueTest"]["short"])) - base = np.ones(shape=(1,300)) + self.assertTrue(np.array_equal(zeroes, cdb.cui2context_vectors["UniqueTest"]["short"])) for i, cui in enumerate(cdb1.cui2names): - self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(base, i+2))) - + self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(ones, 
i+2))) + self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) + self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) + for cui in cdb2.cui2names: + self.assertTrue(np.array_equal(overwrite_cdb.cui2context_vectors[cui]["short"], zeroes)) + self.assertEqual(overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(overwrite_cdb.addl_info["cui2description"][cui], "test_description") if __name__ == '__main__': From 7d694f2ecaa1025a08a504fb26f45c0fb0e2d3ab Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Wed, 13 Dec 2023 20:41:06 +0000 Subject: [PATCH 41/64] moved set up of cdbs to be merged to tests.helper --- tests/helper.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_cdb.py | 30 ++++-------------------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/tests/helper.py b/tests/helper.py index 9fb66589b..3da571758 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -6,6 +6,8 @@ import numpy as np from medcat.vocab import Vocab +from medcat.cdb_maker import CDBMaker +from medcat.config import Config class AsyncMock(unittest.mock.MagicMock): @@ -86,3 +88,36 @@ def check_or_download(self): return with open(self.vocab_path, 'wb') as f: f.write(tmp.content) + + +class ForCDBMerging: + + def __init__(self) -> None: + # generating cdbs - two maker are requested as they point to the same created CDB. + config = Config() + config.general["spacy_model"] = "en_core_web_md" + maker1 = CDBMaker(config) + maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") + self.cdb1 = maker1.prepare_csvs(csv_paths=[path]) + self.cdb2 = maker2.prepare_csvs(csv_paths=[path]) + + # generating context vectors here for for testing the weighted average function (based off cui2count_train) + zeroes = np.zeros(shape=(1,300)) + ones = np.ones(shape=(1,300)) + for i, cui in enumerate(self.cdb1.cui2names): + self.cdb1.cui2context_vectors[cui] = {"short": ones} + self.cdb2.cui2context_vectors[cui] = {"short": zeroes} + self.cdb1.cui2count_train[cui] = 1 + self.cdb2.cui2count_train[cui] = i + 1 + # adding new names and cuis to each cdb to test after merging + test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} + self.cdb1.add_names("C0006826", test_add) + unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} + self.cdb2.add_names("UniqueTest", unique_test) + self.cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} + self.cdb2.addl_info["cui2ontologies"] = {} + self.cdb2.addl_info["cui2description"] = {} + for cui in self.cdb2.cui2names: + self.cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + self.cdb2.addl_info["cui2description"][cui] = "test_description" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 3ff7e5dad..08b0cee88 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -7,6 +7,7 @@ from medcat.config import Config from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB +from .helper import ForCDBMerging class CDBTests(unittest.TestCase): @@ -92,34 +93,11 @@ def test_cui2snames_population(self): def test_merge_cdb(self): - # generating cdbs - two maker are requested as they point to the same created CDB. 
- config = Config() - config.general["spacy_model"] = "en_core_web_md" - maker1 = CDBMaker(config) - maker2 = CDBMaker(config) # second maker is required as it will otherwise point to same object - path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_creator", "umls_sample.csv") - cdb1 = maker1.prepare_csvs(csv_paths=[path]) - cdb2 = maker2.prepare_csvs(csv_paths=[path]) - - # generating context vectors here for for testing the weighted average function (based off cui2count_train) + to_merge = ForCDBMerging() + cdb1 = to_merge.cdb1 + cdb2 = to_merge.cdb2 zeroes = np.zeros(shape=(1,300)) ones = np.ones(shape=(1,300)) - for i, cui in enumerate(cdb1.cui2names): - cdb1.cui2context_vectors[cui] = {"short": ones} - cdb2.cui2context_vectors[cui] = {"short": zeroes} - cdb1.cui2count_train[cui] = 1 - cdb2.cui2count_train[cui] = i + 1 - # adding new names and cuis to each cdb to test after merging - test_add = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} - cdb1.add_names("C0006826", test_add) - unique_test = {"test": {'tokens': "test_token", 'snames': ["test_name"], 'raw_name': "test_raw_name", "is_upper": "P"}} - cdb2.add_names("UniqueTest", unique_test) - cdb2.cui2context_vectors["UniqueTest"] = {"short": zeroes} - cdb2.addl_info["cui2ontologies"] = {} - cdb2.addl_info["cui2description"] = {} - for cui in cdb2.cui2names: - cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] - cdb2.addl_info["cui2description"][cui] = "test_description" # merging cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) From 7cdd208c69eedf43c176cbe2fbbeba7de17ecd9f Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Thu, 14 Dec 2023 17:56:37 +0000 Subject: [PATCH 42/64] moved merge_cdb to utils and created test_cdb_utils --- medcat/cdb.py | 111 ------------------------------- medcat/utils/cdb_utils.py | 120 ++++++++++++++++++++++++++++++++++ tests/helper.py | 2 +- tests/test_cdb.py | 27 -------- tests/utils/test_cdb_utils.py | 42 ++++++++++++ 5 files changed, 163 insertions(+), 139 deletions(-) create mode 100644 medcat/utils/cdb_utils.py create mode 100644 tests/utils/test_cdb_utils.py diff --git a/medcat/cdb.py b/medcat/cdb.py index 1737b4bad..2ca8382a7 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -7,7 +7,6 @@ import numpy as np from typing import Dict, Set, Optional, List, Union, cast from functools import partial -from copy import deepcopy import os from medcat import __version__ @@ -925,113 +924,3 @@ def calculate_hash(self): self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash - - @staticmethod - def merge_cdb(cdb1: "CDB", - cdb2: "CDB", - overwrite_training: int = 0, - full_build: bool = False): - """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. - `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` - - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite_training (int): - Choose to prioritise a CDB's context vectors values over merging gracefully. 
0 - no prio, 1 - CDB1, 2 - CDB2 - full_build (bool): - Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" - """ - config = deepcopy(cdb1.config) - cdb = CDB(config) - - # Copy CDB 1 - as all settings from CDB 1 will be carried over - cdb.cui2names = deepcopy(cdb1.cui2names) - cdb.cui2snames = deepcopy(cdb1.cui2snames) - cdb.cui2count_train = deepcopy(cdb1.cui2count_train) - cdb.cui2info = deepcopy(cdb1.cui2info) - cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) - cdb.cui2tags = deepcopy(cdb1.cui2tags) - cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) - cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) - cdb.name2cuis = deepcopy(cdb1.name2cuis) - cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) - cdb.name2count_train = deepcopy(cdb1.name2count_train) - cdb.name_isupper = deepcopy(cdb1.name_isupper) - if full_build: - cdb.addl_info = deepcopy(cdb1.addl_info) - - # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name - for cui in cdb2.cui2names: - names = dict() - for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} - name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' - # For addl_info check cui2original_names as they MUST be added - ontologies = set() - description = '' - to_build = False - if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): - to_build = True - if 'cui2ontologies' in cdb2.addl_info: - ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) - if 'cui2description' in cdb2.addl_info: - description = cdb2.addl_info['cui2description'][cui] - cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) - if cui in cdb1.cui2names: - if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): - if overwrite_training == 2 and cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] - else: - cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): - if overwrite_training == 2 and cui in cdb2.cui2context_vectors: - weights = [0, 1] - else: - norm = cdb.cui2count_train[cui] - weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - for s in contexts: - cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) - if cui in cdb1.cui2tags: - cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: - cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) - else: - if cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] - if cui in cdb2.cui2context_vectors: - cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if 
cui in cdb2.cui2tags: - cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: - cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - - if overwrite_training != 1: - for name in cdb2.name2cuis: - if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] - - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) - - # vocab, adding counts if they occur in both - cdb.vocab = deepcopy(cdb1.vocab) - if overwrite_training != 1: - for word in cdb2.vocab: - if word in cdb.vocab and overwrite_training == 0: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] - - return cdb diff --git a/medcat/utils/cdb_utils.py b/medcat/utils/cdb_utils.py new file mode 100644 index 000000000..b7097c3f6 --- /dev/null +++ b/medcat/utils/cdb_utils.py @@ -0,0 +1,120 @@ +import logging +import numpy as np + +from copy import deepcopy +from medcat.cdb import CDB + +logger = logging.getLogger(__name__) # separate logger from the package-level one + + +class cdb_utils(object): + + @staticmethod + def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. + `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` + + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite_training (int): + Choose to prioritise a CDB's context vectors values over merging gracefully. 
0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" + """ + config = deepcopy(cdb1.config) + cdb = CDB(config) + + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) + + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name + for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added + ontologies = set() + description = '' + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True + if 'cui2ontologies' in cdb2.addl_info: + ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) + if cui in cdb1.cui2names: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and cui in cdb2.cui2context_vectors: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + else: + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if 
cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] + + if overwrite_training != 1: + for name in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason + else: + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] + + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) + + # vocab, adding counts if they occur in both + cdb.vocab = deepcopy(cdb1.vocab) + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] + + return cdb diff --git a/tests/helper.py b/tests/helper.py index 3da571758..52943c3cd 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -119,5 +119,5 @@ def __init__(self) -> None: self.cdb2.addl_info["cui2ontologies"] = {} self.cdb2.addl_info["cui2description"] = {} for cui in self.cdb2.cui2names: - self.cdb2.addl_info["cui2ontologies"][cui] = ["test_ontology"] + self.cdb2.addl_info["cui2ontologies"][cui] = {"test_ontology"} self.cdb2.addl_info["cui2description"][cui] = "test_description" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 08b0cee88..eb98e28ba 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -7,7 +7,6 @@ from medcat.config import Config from medcat.cdb_maker import CDBMaker from medcat.cdb import CDB -from .helper import ForCDBMerging class CDBTests(unittest.TestCase): @@ -92,31 +91,5 @@ def test_cui2snames_population(self): self.assertIn(cui, self.undertest.cui2snames) - def test_merge_cdb(self): - to_merge = ForCDBMerging() - cdb1 = to_merge.cdb1 - cdb2 = to_merge.cdb2 - zeroes = np.zeros(shape=(1,300)) - ones = np.ones(shape=(1,300)) - - # merging - cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2) - overwrite_cdb = CDB.merge_cdb(cdb1=cdb1, cdb2=cdb2, overwrite_training=2, full_build=True) - - # tests - self.assertIn("test", cdb.cui2names["C0006826"]) - self.assertIn("test_name", cdb.cui2snames["C0006826"]) - self.assertEqual("Cancer", cdb.cui2preferred_name["C0006826"]) - self.assertTrue(np.array_equal(zeroes, cdb.cui2context_vectors["UniqueTest"]["short"])) - for i, cui in enumerate(cdb1.cui2names): - self.assertTrue(np.array_equal(cdb.cui2context_vectors[cui]["short"], np.divide(ones, i+2))) - self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) - self.assertEqual(cdb.addl_info["cui2ontologies"], dict()) - for cui in cdb2.cui2names: - self.assertTrue(np.array_equal(overwrite_cdb.cui2context_vectors[cui]["short"], zeroes)) - self.assertEqual(overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) - self.assertEqual(overwrite_cdb.addl_info["cui2description"][cui], "test_description") - - if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py new file mode 100644 index 000000000..3f699d767 --- /dev/null +++ b/tests/utils/test_cdb_utils.py @@ -0,0 +1,42 @@ +import unittest +import numpy as np +from tests.helper import ForCDBMerging +from medcat.utils.cdb_utils import cdb_utils + + +class CDBMergeTests(unittest.TestCase): + @classmethod + def setUp(cls) -> None: + to_merge = ForCDBMerging() + cls.cdb1 = to_merge.cdb1 + cls.cdb2 = to_merge.cdb2 + cls.merged_cdb 
= cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) + cls.overwrite_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) + cls.zeroes = np.zeros(shape=(1,300)) + cls.ones = np.ones(shape=(1,300)) + + def test_merge_inserts(self): + self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) + self.assertIn("test_name", self.merged_cdb.cui2snames["C0006826"]) + self.assertEqual("Cancer", self.merged_cdb.cui2preferred_name["C0006826"]) + + def test_no_full_build(self): + self.assertEqual(self.merged_cdb.addl_info["cui2ontologies"], dict()) + self.assertEqual(self.merged_cdb.addl_info["cui2ontologies"], dict()) + + def test_full_build(self): + for cui in self.cdb2.cui2names: + self.assertEqual(self.overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(self.overwrite_cdb.addl_info["cui2description"][cui], "test_description") + + def test_vector_merge(self): + self.assertTrue(np.array_equal(self.zeroes, self.merged_cdb.cui2context_vectors["UniqueTest"]["short"])) + for i, cui in enumerate(self.cdb1.cui2names): + self.assertTrue(np.array_equal(self.merged_cdb.cui2context_vectors[cui]["short"], np.divide(self.ones, i+2))) + + + def test_overwrite_parameter(self): + for cui in self.cdb2.cui2names: + self.assertTrue(np.array_equal(self.overwrite_cdb.cui2context_vectors[cui]["short"], self.zeroes)) + self.assertEqual(self.overwrite_cdb.addl_info["cui2ontologies"][cui], {"test_ontology"}) + self.assertEqual(self.overwrite_cdb.addl_info["cui2description"][cui], "test_description") From fe9ef662cc9446c7556df1da76a2f9d040397632 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 09:54:38 +0000 Subject: [PATCH 43/64] removed class wrapper in cdb utils and fixed class set up in tests --- medcat/utils/cdb_utils.py | 203 +++++++++++++++++----------------- tests/utils/test_cdb_utils.py | 18 +-- 2 files changed, 109 insertions(+), 112 deletions(-) diff --git a/medcat/utils/cdb_utils.py b/medcat/utils/cdb_utils.py index b7097c3f6..445fb7d6f 100644 --- a/medcat/utils/cdb_utils.py +++ b/medcat/utils/cdb_utils.py @@ -7,114 +7,111 @@ logger = logging.getLogger(__name__) # separate logger from the package-level one -class cdb_utils(object): +def merge_cdb(cdb1: "CDB", + cdb2: "CDB", + overwrite_training: int = 0, + full_build: bool = False): + """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. + `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` - @staticmethod - def merge_cdb(cdb1: "CDB", - cdb2: "CDB", - overwrite_training: int = 0, - full_build: bool = False): - """Merge two CDB's together to produce a new, single CDB. The contents of inputs CDBs will not be changed. - `addl_info` can not be perfectly merged, and will prioritise cdb1. see `full_build` + Args: + cdb1 (medcat.cdb.CDB): + The first medcat cdb to merge. In cases where merging isn't suitable isn't ideal (such as + cui2preferred_name), this cdb values will be prioritised over cdb2. + cdb2 (medcat.cdb.CDB): + The second medcat cdb to merge. + overwrite_training (int): + Choose to prioritise a CDB's context vectors values over merging gracefully. 0 - no prio, 1 - CDB1, 2 - CDB2 + full_build (bool): + Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" + """ + config = deepcopy(cdb1.config) + cdb = CDB(config) - Args: - cdb1 (medcat.cdb.CDB): - The first medcat cdb to merge. 
In cases where merging isn't suitable isn't ideal (such as - cui2preferred_name), this cdb values will be prioritised over cdb2. - cdb2 (medcat.cdb.CDB): - The second medcat cdb to merge. - overwrite_training (int): - Choose to prioritise a CDB's context vectors values over merging gracefully. 0 - no prio, 1 - CDB1, 2 - CDB2 - full_build (bool): - Add additional information from "addl_info" dicts "cui2ontologies" and "cui2description" - """ - config = deepcopy(cdb1.config) - cdb = CDB(config) + # Copy CDB 1 - as all settings from CDB 1 will be carried over + cdb.cui2names = deepcopy(cdb1.cui2names) + cdb.cui2snames = deepcopy(cdb1.cui2snames) + cdb.cui2count_train = deepcopy(cdb1.cui2count_train) + cdb.cui2info = deepcopy(cdb1.cui2info) + cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) + cdb.cui2tags = deepcopy(cdb1.cui2tags) + cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) + cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) + cdb.name2cuis = deepcopy(cdb1.name2cuis) + cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) + cdb.name2count_train = deepcopy(cdb1.name2count_train) + cdb.name_isupper = deepcopy(cdb1.name_isupper) + if full_build: + cdb.addl_info = deepcopy(cdb1.addl_info) - # Copy CDB 1 - as all settings from CDB 1 will be carried over - cdb.cui2names = deepcopy(cdb1.cui2names) - cdb.cui2snames = deepcopy(cdb1.cui2snames) - cdb.cui2count_train = deepcopy(cdb1.cui2count_train) - cdb.cui2info = deepcopy(cdb1.cui2info) - cdb.cui2context_vectors = deepcopy(cdb1.cui2context_vectors) - cdb.cui2tags = deepcopy(cdb1.cui2tags) - cdb.cui2type_ids = deepcopy(cdb1.cui2type_ids) - cdb.cui2preferred_name = deepcopy(cdb1.cui2preferred_name) - cdb.name2cuis = deepcopy(cdb1.name2cuis) - cdb.name2cuis2status = deepcopy(cdb1.name2cuis2status) - cdb.name2count_train = deepcopy(cdb1.name2count_train) - cdb.name_isupper = deepcopy(cdb1.name_isupper) - if full_build: - cdb.addl_info = deepcopy(cdb1.addl_info) + # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name + for cui in cdb2.cui2names: + names = dict() + for name in cdb2.cui2names[cui]: + names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} + name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' + # For addl_info check cui2original_names as they MUST be added + ontologies = set() + description = '' + to_build = False + if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): + to_build = True + if 'cui2ontologies' in cdb2.addl_info: + ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) + if 'cui2description' in cdb2.addl_info: + description = cdb2.addl_info['cui2description'][cui] + cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, + type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) + if cui in cdb1.cui2names: + if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): + if overwrite_training == 2 and cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] + else: + cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) + if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): + if overwrite_training == 2 and 
cui in cdb2.cui2context_vectors: + weights = [0, 1] + else: + norm = cdb.cui2count_train[cui] + weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] + contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short + for s in contexts: + cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + if cui in cdb1.cui2tags: + cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) + if cui in cdb1.cui2type_ids: + cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + else: + if cui in cdb2.cui2count_train: + cdb.cui2count_train[cui] = cdb2.cui2names[cui] + if cui in cdb2.cui2info: + cdb.cui2info[cui] = cdb2.cui2info[cui] + if cui in cdb2.cui2context_vectors: + cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] + if cui in cdb2.cui2tags: + cdb.cui2tags[cui] = cdb2.cui2tags[cui] + if cui in cdb2.cui2type_ids: + cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - # handles cui2names, cui2snames, name_isupper, name2cuis, name2cuis2status, cui2preferred_name - for cui in cdb2.cui2names: - names = dict() - for name in cdb2.cui2names[cui]: - names[name] = {'snames': cdb2.cui2snames.get(cui, set()), 'is_upper': cdb2.name_isupper.get(name, False), 'tokens': {}, 'raw_name': cdb2.get_name(cui)} - name_status = cdb2.name2cuis2status.get(name, 'A').get(cui, 'A') # get the name status if it exists, default to 'A' - # For addl_info check cui2original_names as they MUST be added - ontologies = set() - description = '' - to_build = False - if full_build and (cui in cdb2.addl_info['cui2original_names'] or cui in cdb2.addl_info['cui2description']): - to_build = True - if 'cui2ontologies' in cdb2.addl_info: - ontologies.update(cdb2.addl_info['cui2ontologies'][cui]) - if 'cui2description' in cdb2.addl_info: - description = cdb2.addl_info['cui2description'][cui] - cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, - type_ids=cdb2.cui2type_ids[cui], description=description, full_build=to_build) - if cui in cdb1.cui2names: - if (cui in cdb1.cui2count_train or cui in cdb2.cui2count_train) and not (overwrite_training == 1 and cui in cdb1.cui2count_train): - if overwrite_training == 2 and cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2count_train[cui] - else: - cdb.cui2count_train[cui] = cdb1.cui2count_train.get(cui, 0) + cdb2.cui2count_train.get(cui, 0) - if cui in cdb1.cui2context_vectors and not (overwrite_training == 1 and cui in cdb1.cui2context_vectors[cui]): - if overwrite_training == 2 and cui in cdb2.cui2context_vectors: - weights = [0, 1] - else: - norm = cdb.cui2count_train[cui] - weights = [np.divide(cdb1.cui2count_train.get(cui, 0), norm), np.divide(cdb2.cui2count_train.get(cui, 0), norm)] - contexts = set(list(cdb1.cui2context_vectors.get(cui, {}).keys()) + list(cdb2.cui2context_vectors.get(cui, {}).keys())) # xlong, long, medium, short - for s in contexts: - cdb.cui2context_vectors[cui][s] = (weights[0] * cdb1.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) + (weights[1] * cdb2.cui2context_vectors[cui].get(s, np.zeros(shape=(300)))) - if cui in cdb1.cui2tags: - cdb.cui2tags[cui].append(cdb2.cui2tags[cui]) - if cui in cdb1.cui2type_ids: - cdb.cui2type_ids[cui] = cdb1.cui2type_ids[cui].union(cdb2.cui2type_ids[cui]) + if overwrite_training != 1: + for name 
in cdb2.name2cuis: + if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs + if name in cdb1.name2count_train and name in cdb2.name2count_train: + cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason else: - if cui in cdb2.cui2count_train: - cdb.cui2count_train[cui] = cdb2.cui2names[cui] - if cui in cdb2.cui2info: - cdb.cui2info[cui] = cdb2.cui2info[cui] - if cui in cdb2.cui2context_vectors: - cdb.cui2context_vectors[cui] = cdb2.cui2context_vectors[cui] - if cui in cdb2.cui2tags: - cdb.cui2tags[cui] = cdb2.cui2tags[cui] - if cui in cdb2.cui2type_ids: - cdb.cui2type_ids[cui] = cdb2.cui2type_ids[cui] - - if overwrite_training != 1: - for name in cdb2.name2cuis: - if name in cdb1.name2cuis and overwrite_training == 0: # if they exist in both cdbs - if name in cdb1.name2count_train and name in cdb2.name2count_train: - cdb.name2count_train[name] = str(int(cdb1.name2count_train[name]) + int(cdb2.name2count_train[name])) # these are strings for some reason - else: - if name in cdb2.name2count_train: - cdb.name2count_train[name] = cdb2.name2count_train[name] + if name in cdb2.name2count_train: + cdb.name2count_train[name] = cdb2.name2count_train[name] - # snames - cdb.snames = cdb1.snames.union(cdb2.snames) + # snames + cdb.snames = cdb1.snames.union(cdb2.snames) - # vocab, adding counts if they occur in both - cdb.vocab = deepcopy(cdb1.vocab) - if overwrite_training != 1: - for word in cdb2.vocab: - if word in cdb.vocab and overwrite_training == 0: - cdb.vocab[word] += cdb2.vocab[word] - else: - cdb.vocab[word] = cdb2.vocab[word] + # vocab, adding counts if they occur in both + cdb.vocab = deepcopy(cdb1.vocab) + if overwrite_training != 1: + for word in cdb2.vocab: + if word in cdb.vocab and overwrite_training == 0: + cdb.vocab[word] += cdb2.vocab[word] + else: + cdb.vocab[word] = cdb2.vocab[word] - return cdb + return cdb diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py index 3f699d767..bc0e6796f 100644 --- a/tests/utils/test_cdb_utils.py +++ b/tests/utils/test_cdb_utils.py @@ -1,19 +1,19 @@ import unittest import numpy as np from tests.helper import ForCDBMerging -from medcat.utils.cdb_utils import cdb_utils +from medcat.utils.cdb_utils import merge_cdb class CDBMergeTests(unittest.TestCase): - @classmethod - def setUp(cls) -> None: + + def setUp(self) -> None: to_merge = ForCDBMerging() - cls.cdb1 = to_merge.cdb1 - cls.cdb2 = to_merge.cdb2 - cls.merged_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) - cls.overwrite_cdb = cdb_utils.merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) - cls.zeroes = np.zeros(shape=(1,300)) - cls.ones = np.ones(shape=(1,300)) + self.cdb1 = to_merge.cdb1 + self.cdb2 = to_merge.cdb2 + self.merged_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2) + self.overwrite_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2, overwrite_training=2, full_build=True) + self.zeroes = np.zeros(shape=(1,300)) + self.ones = np.ones(shape=(1,300)) def test_merge_inserts(self): self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) From f70e61d97e51a7680475b12ae8b7868d0c0f1728 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 10:07:24 +0000 Subject: [PATCH 44/64] changed test object setup to class setup --- tests/utils/test_cdb_utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/utils/test_cdb_utils.py b/tests/utils/test_cdb_utils.py index 
bc0e6796f..777a2506b 100644 --- a/tests/utils/test_cdb_utils.py +++ b/tests/utils/test_cdb_utils.py @@ -6,14 +6,15 @@ class CDBMergeTests(unittest.TestCase): - def setUp(self) -> None: + @classmethod + def setUpClass(cls): to_merge = ForCDBMerging() - self.cdb1 = to_merge.cdb1 - self.cdb2 = to_merge.cdb2 - self.merged_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2) - self.overwrite_cdb = merge_cdb(cdb1=self.cdb1, cdb2=self.cdb2, overwrite_training=2, full_build=True) - self.zeroes = np.zeros(shape=(1,300)) - self.ones = np.ones(shape=(1,300)) + cls.cdb1 = to_merge.cdb1 + cls.cdb2 = to_merge.cdb2 + cls.merged_cdb = merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2) + cls.overwrite_cdb = merge_cdb(cdb1=cls.cdb1, cdb2=cls.cdb2, overwrite_training=2, full_build=True) + cls.zeroes = np.zeros(shape=(1,300)) + cls.ones = np.ones(shape=(1,300)) def test_merge_inserts(self): self.assertIn("test", self.merged_cdb.cui2names["C0006826"]) From c74fe1f64ff590c6bf8a139a698600b2d2087967 Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 15 Dec 2023 10:24:26 +0000 Subject: [PATCH 45/64] removed erroneous new line --- tests/test_cdb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cdb.py b/tests/test_cdb.py index eb98e28ba..f7be24d64 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -90,6 +90,5 @@ def test_cui2snames_population(self): with self.subTest(cui): self.assertIn(cui, self.undertest.cui2snames) - if __name__ == '__main__': unittest.main() From 45cef2b553c3f498e387ded15391716b77a640d6 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 18 Dec 2023 17:00:11 +0200 Subject: [PATCH 46/64] CU-2e77a31 improve print stats (#366) * Add base class for CAT * Add CDB base class * Some whitespace fixes for base modules * CU-2e77a31: Move print stats to their own module and class * CU-2e77a31: Fix issues introduced by moving print stats * CU-2e77a31: Rename print_stats to get_stats and add option to avoid printed output * CU-2e77a31: Add test for print_stats * CU-2e77a31: Remove unused import * CU-2e77a31: Add new package to setup.py * CU-2e77a31: Fix a bunch of typing issues * CU-2e77a31: Revert CAT and CDB abstraction --- medcat/cat.py | 218 +---------------- medcat/stats/__init__.py | 0 medcat/stats/stats.py | 340 +++++++++++++++++++++++++++ medcat/utils/filters.py | 43 +++- medcat/utils/regression/targeting.py | 4 +- setup.py | 2 +- tests/test_cat.py | 42 +++- 7 files changed, 433 insertions(+), 216 deletions(-) create mode 100644 medcat/stats/__init__.py create mode 100644 medcat/stats/stats.py diff --git a/medcat/cat.py b/medcat/cat.py index f49a25022..d3003b24b 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -2,7 +2,6 @@ import glob import shutil import pickle -import traceback import json import logging import math @@ -24,7 +23,6 @@ from medcat.pipe import Pipe from medcat.preprocessing.taggers import tag_skip_and_punct from medcat.cdb import CDB -from medcat.utils.matutils import intersect_nonempty_set from medcat.utils.data_utils import make_mc_train_test, get_false_positives from medcat.utils.normalizers import BasicSpellChecker from medcat.utils.checkpoint import Checkpoint, CheckpointConfig, CheckpointManager @@ -32,15 +30,16 @@ from medcat.utils.hasher import Hasher from medcat.ner.vocab_based_ner import NER from medcat.linking.context_based_linker import Linker -from medcat.utils.filters import get_project_filters from medcat.preprocessing.cleaners import prepare_name from medcat.meta_cat import MetaCAT from medcat.utils.meta_cat.data_utils import json_to_fake_spacy -from 
medcat.config import Config, LinkingFilters +from medcat.config import Config from medcat.vocab import Vocab from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY +from medcat.stats.stats import get_stats +from medcat.utils.filters import set_project_filters logger = logging.getLogger(__name__) # separate logger from the package-level one @@ -442,7 +441,8 @@ def _print_stats(self, use_overlaps: bool = False, use_cui_doc_limit: bool = False, use_groups: bool = False, - extra_cui_filter: Optional[Set] = None) -> Tuple: + extra_cui_filter: Optional[Set] = None, + do_print: bool = True) -> Tuple: """TODO: Refactor and make nice Print metrics on a dataset (F1, P, R), it will also print the concepts that have the most FP,FN,TP. @@ -482,204 +482,12 @@ def _print_stats(self, Number of occurrence for each CUI. examples (dict): Examples for each of the fp, fn, tp. Format will be examples['fp']['cui'][]. + do_print (bool): + Whether to print stats out. Defaults to True. """ - tp = 0 - fp = 0 - fn = 0 - fps: Dict = {} - fns: Dict = {} - tps: Dict = {} - cui_prec: Dict = {} - cui_rec: Dict = {} - cui_f1: Dict = {} - cui_counts: Dict = {} - examples: Dict = {'fp': {}, 'fn': {}, 'tp': {}} - - fp_docs: Set = set() - fn_docs: Set = set() - - orig_filters = self.config.linking.filters.copy_of() - local_filters = self.config.linking.filters - for pind, project in tqdm(enumerate(data['projects']), desc="Stats project", total=len(data['projects']), leave=False): - local_filters.cuis = set() - - # Add extra filter if set - self._set_project_filters(local_filters, project, extra_cui_filter, use_project_filters) - - for dind, doc in tqdm( - enumerate(project["documents"]), - desc="Stats document", - total=len(project["documents"]), - leave=False, - ): - anns = self._get_doc_annotations(doc) - - # Apply document level filtering, in this case project_filter is ignored while the extra_cui_filter is respected still - if use_cui_doc_limit: - _cuis = set([ann['cui'] for ann in anns]) - if _cuis: - local_filters.cuis = intersect_nonempty_set(_cuis, extra_cui_filter) - else: - local_filters.cuis = {'empty'} - - spacy_doc: Doc = self(doc['text']) # type: ignore - - if use_overlaps: - p_anns = spacy_doc._.ents - else: - p_anns = spacy_doc.ents - - anns_norm = [] - anns_norm_neg = [] - anns_examples = [] - anns_norm_cui = [] - for ann in anns: - cui = ann['cui'] - if local_filters.check_filters(cui): - if use_groups: - cui = self.cdb.addl_info['cui2group'].get(cui, cui) - - if ann.get('validated', True) and (not ann.get('killed', False) and not ann.get('deleted', False)): - anns_norm.append((ann['start'], cui)) - anns_examples.append({"text": doc['text'][max(0, ann['start']-60):ann['end']+60], - "cui": cui, - "start": ann['start'], - "end": ann['end'], - "source value": ann['value'], - "acc": 1, - "project name": project.get('name'), - "document name": doc.get('name'), - "project id": project.get('id'), - "document id": doc.get('id')}) - elif ann.get('validated', True) and (ann.get('killed', False) or ann.get('deleted', False)): - anns_norm_neg.append((ann['start'], cui)) - - if ann.get("validated", True): - # This is used to test was someone annotating for this CUI in this document - anns_norm_cui.append(cui) - cui_counts[cui] = cui_counts.get(cui, 0) + 1 - - p_anns_norm = [] - p_anns_examples = [] - for ann in p_anns: - cui = ann._.cui - if use_groups: - cui = self.cdb.addl_info['cui2group'].get(cui, cui) 
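Both the gold annotations and the predictions are reduced to (start offset, CUI) pairs before being compared, so a prediction only counts as a true positive when the span start and the concept both agree. A minimal, stand-alone sketch of that comparison (plain tuples, invented CUIs, no spaCy or MedCAT objects) looks roughly like this:

gold = [(12, "C0011849"), (40, "C0020538")]        # e.g. from the trainer export
predicted = [(12, "C0011849"), (75, "C0038454")]   # e.g. from the spaCy doc entities

tp = [p for p in predicted if p in gold]       # start and CUI both agree
fp = [p for p in predicted if p not in gold]   # predicted, but not annotated
fn = [g for g in gold if g not in predicted]   # annotated, but missed

print(tp, fp, fn)  # [(12, 'C0011849')] [(75, 'C0038454')] [(40, 'C0020538')]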
- - p_anns_norm.append((ann.start_char, cui)) - p_anns_examples.append({"text": doc['text'][max(0, ann.start_char-60):ann.end_char+60], - "cui": cui, - "start": ann.start_char, - "end": ann.end_char, - "source value": ann.text, - "acc": float(ann._.context_similarity), - "project name": project.get('name'), - "document name": doc.get('name'), - "project id": project.get('id'), - "document id": doc.get('id')}) - for iann, ann in enumerate(p_anns_norm): - cui = ann[1] - if ann in anns_norm: - tp += 1 - tps[cui] = tps.get(cui, 0) + 1 - - example = p_anns_examples[iann] - examples['tp'][cui] = examples['tp'].get(cui, []) + [example] - else: - fp += 1 - fps[cui] = fps.get(cui, 0) + 1 - fp_docs.add(doc.get('name', 'unk')) - - # Add example for this FP prediction - example = p_anns_examples[iann] - if ann in anns_norm_neg: - # Means that it really was annotated as negative - example['real_fp'] = True - - examples['fp'][cui] = examples['fp'].get(cui, []) + [example] - - for iann, ann in enumerate(anns_norm): - if ann not in p_anns_norm: - cui = ann[1] - fn += 1 - fn_docs.add(doc.get('name', 'unk')) - - fns[cui] = fns.get(cui, 0) + 1 - examples['fn'][cui] = examples['fn'].get(cui, []) + [anns_examples[iann]] - - try: - prec = tp / (tp + fp) - rec = tp / (tp + fn) - f1 = 2*(prec*rec) / (prec + rec) - print("Epoch: {}, Prec: {}, Rec: {}, F1: {}\n".format(epoch, prec, rec, f1)) - print("Docs with false positives: {}\n".format("; ".join([str(x) for x in list(fp_docs)[0:10]]))) - print("Docs with false negatives: {}\n".format("; ".join([str(x) for x in list(fn_docs)[0:10]]))) - - # Sort fns & prec - fps = {k: v for k, v in sorted(fps.items(), key=lambda item: item[1], reverse=True)} - fns = {k: v for k, v in sorted(fns.items(), key=lambda item: item[1], reverse=True)} - tps = {k: v for k, v in sorted(tps.items(), key=lambda item: item[1], reverse=True)} - - - # F1 per concept - for cui in tps.keys(): - prec = tps[cui] / (tps.get(cui, 0) + fps.get(cui, 0)) - rec = tps[cui] / (tps.get(cui, 0) + fns.get(cui, 0)) - f1 = 2*(prec*rec) / (prec + rec) - cui_prec[cui] = prec - cui_rec[cui] = rec - cui_f1[cui] = f1 - - - # Get top 10 - pr_fps = [(self.cdb.cui2preferred_name.get(cui, - list(self.cdb.cui2names.get(cui, [cui]))[0]), cui, fps[cui]) for cui in list(fps.keys())[0:10]] - pr_fns = [(self.cdb.cui2preferred_name.get(cui, - list(self.cdb.cui2names.get(cui, [cui]))[0]), cui, fns[cui]) for cui in list(fns.keys())[0:10]] - pr_tps = [(self.cdb.cui2preferred_name.get(cui, - list(self.cdb.cui2names.get(cui, [cui]))[0]), cui, tps[cui]) for cui in list(tps.keys())[0:10]] - - - print("\n\nFalse Positives\n") - for one in pr_fps: - print("{:70} - {:20} - {:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) - print("\n\nFalse Negatives\n") - for one in pr_fns: - print("{:70} - {:20} - {:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) - print("\n\nTrue Positives\n") - for one in pr_tps: - print("{:70} - {:20} - {:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) - print("*"*110 + "\n") - - except Exception: - traceback.print_exc() - - self.config.linking.filters = orig_filters - - return fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples - - def _set_project_filters(self, local_filters: LinkingFilters, project: dict, - extra_cui_filter: Optional[Set], use_project_filters: bool): - """Set the project filters to a LinkingFilters object based on - the specified project. 
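In this helper the extra CUI filter is applied first, and the per-project filter is then intersected with it, but only when the project filter is non-empty. A rough, self-contained approximation of that behaviour (the real intersection lives in medcat.utils.matutils.intersect_nonempty_set; the helper and CUIs below are invented for illustration):

def intersect_if_nonempty(new_cuis: set, existing: set) -> set:
    # Local stand-in for the intersect_nonempty_set behaviour used here:
    # keep the new set as-is when nothing was set before, otherwise intersect.
    return new_cuis & existing if existing else new_cuis

extra_cui_filter = {"C0011849", "C0020538"}   # supplied by the caller
project_filter = {"C0020538", "C0038454"}     # built from the project's cuis/tuis

cuis = extra_cui_filter                        # extra filter is applied first
cuis = intersect_if_nonempty(project_filter, cuis)
print(cuis)                                    # {'C0020538'}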
- - Args: - local_filters (LinkingFilters): The linking filters instance - project (dict): The project - extra_cui_filter (Optional[Set]): Extra CUIs (if specified) - use_project_filters (bool): Whether to use per-project filters - """ - if isinstance(extra_cui_filter, set): - local_filters.cuis = extra_cui_filter - - if use_project_filters: - project_filter = get_project_filters(cuis=project.get('cuis', None), - type_ids=project.get('tuis', None), - cdb=self.cdb, - project=project) - # Intersect project filter with existing if it has something - if project_filter: - local_filters.cuis = intersect_nonempty_set(project_filter, local_filters.cuis) + return get_stats(self, data=data, epoch=epoch, use_project_filters=use_project_filters, + use_overlaps=use_overlaps, use_cui_doc_limit=use_cui_doc_limit, + use_groups=use_groups, extra_cui_filter=extra_cui_filter, do_print=do_print) def _init_ckpts(self, is_resumed, checkpoint): if self.config.general.checkpoint.steps is not None or checkpoint is not None: @@ -1114,15 +922,15 @@ def train_supervised_raw(self, # then add the extra CUI filters if retain_filters and extra_cui_filter and not retain_extra_cui_filter: # adding project filters without extra_cui_filters - self._set_project_filters(local_filters, project, set(), use_filters) + set_project_filters(self.cdb.addl_info, local_filters, project, set(), use_filters) orig_filters.merge_with(local_filters) # adding extra_cui_filters, but NOT project filters - self._set_project_filters(local_filters, project, extra_cui_filter, False) + set_project_filters(self.cdb.addl_info, local_filters, project, extra_cui_filter, False) # refrain from doing it again for subsequent epochs retain_filters = False else: # Set filters in case we are using the train_from_fp - self._set_project_filters(local_filters, project, extra_cui_filter, use_filters) + set_project_filters(self.cdb.addl_info, local_filters, project, extra_cui_filter, use_filters) for idx_doc in trange(current_document, len(project['documents']), initial=current_document, total=len(project['documents']), desc='Document', leave=False): doc = project['documents'][idx_doc] diff --git a/medcat/stats/__init__.py b/medcat/stats/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/medcat/stats/stats.py b/medcat/stats/stats.py new file mode 100644 index 000000000..06b712158 --- /dev/null +++ b/medcat/stats/stats.py @@ -0,0 +1,340 @@ +from typing import Dict, Optional, Set, Tuple, Callable, List, cast + +from tqdm import tqdm +import traceback + +from spacy.tokens import Doc + +from medcat.utils.filters import set_project_filters +from medcat.utils.matutils import intersect_nonempty_set +from medcat.config import LinkingFilters + + +class StatsBuilder: + + def __init__(self, + filters: LinkingFilters, + addl_info: dict, + doc_getter: Callable[[Optional[str], bool], Optional[Doc]], + doc_annotation_getter: Callable[[dict], list], + cui2group: Dict[str, str], + cui2preferred_name: Dict[str, str], + cui2names: Dict[str, Set[str]], + use_project_filters: bool = False, + use_overlaps: bool = False, + use_cui_doc_limit: bool = False, + use_groups: bool = False, + extra_cui_filter: Optional[Set] = None) -> None: + self.filters = filters + self.addl_info = addl_info + self.doc_getter = doc_getter + self._get_doc_annotations = doc_annotation_getter + self.cui2group = cui2group + self.cui2preferred_name = cui2preferred_name + self.cui2names = cui2names + self.use_project_filters = use_project_filters + self.use_overlaps = use_overlaps + 
self.use_cui_doc_limit = use_cui_doc_limit + self.use_groups = use_groups + self.extra_cui_filter = extra_cui_filter + self._reset_stats() + + def _reset_stats(self): + self.tp = 0 + self.fp = 0 + self.fn = 0 + self.fps: Dict = {} + self.fns: Dict = {} + self.tps: Dict = {} + self.cui_prec: Dict = {} + self.cui_rec: Dict = {} + self.cui_f1: Dict = {} + self.cui_counts: Dict = {} + self.examples: Dict = {'fp': {}, 'fn': {}, 'tp': {}} + self.fp_docs: Set = set() + self.fn_docs: Set = set() + + def process_project(self, project: dict) -> None: + self.filters.cuis = set() + + # Add extra filter if set + set_project_filters(self.addl_info, self.filters, project, self.extra_cui_filter, self.use_project_filters) + + documents = project["documents"] + for dind, doc in tqdm( + enumerate(documents), + desc="Stats document", + total=len(documents), + leave=False, + ): + self.process_document(cast(str, project.get('name')), + cast(str, project.get('id')), doc) + + def process_document(self, project_name: str, project_id: str, doc: dict) -> None: + anns = self._get_doc_annotations(doc) + + # Apply document level filtering, in this case project_filter is ignored while the extra_cui_filter is respected still + if self.use_cui_doc_limit: + _cuis = set([ann['cui'] for ann in anns]) + if _cuis: + self.filters.cuis = intersect_nonempty_set(_cuis, self.extra_cui_filter) + else: + self.filters.cuis = {'empty'} + + spacy_doc: Doc = self.doc_getter(doc['text']) # type: ignore + + if self.use_overlaps: + p_anns = spacy_doc._.ents + else: + p_anns = spacy_doc.ents + + (anns_norm, anns_norm_neg, + anns_examples, _) = self._preprocess_annotations(project_name, project_id, doc, anns) + + p_anns_norm, p_anns_examples = self._process_p_anns(project_name, project_id, + doc, p_anns) + self._count_p_anns_norm(doc, anns_norm, anns_norm_neg, + p_anns_norm, p_anns_examples) + self._process_anns_norm(doc, anns_norm, p_anns_norm, anns_examples) + + def _process_anns_norm(self, doc: dict, anns_norm: list, p_anns_norm: list, + anns_examples: list) -> None: + for iann, ann in enumerate(anns_norm): + if ann not in p_anns_norm: + cui = ann[1] + self.fn += 1 + self.fn_docs.add(doc.get('name', 'unk')) + + self.fns[cui] = self.fns.get(cui, 0) + 1 + self.examples['fn'][cui] = self.examples['fn'].get(cui, []) + [anns_examples[iann]] + + def _process_p_anns(self, project_name: str, project_id: str, doc: dict, p_anns: list) -> Tuple[list, list]: + p_anns_norm = [] + p_anns_examples = [] + for ann in p_anns: + cui = ann._.cui + if self.use_groups: + cui = self.cui2group.get(cui, cui) + + p_anns_norm.append((ann.start_char, cui)) + p_anns_examples.append(self._create_annoation_2(project_name, project_id, cui, doc, ann)) + return p_anns_norm, p_anns_examples + + def _count_p_anns_norm(self, doc: dict, anns_norm: list, anns_norm_neg: list, + p_anns_norm: list, p_anns_examples: list) -> None: + for iann, ann in enumerate(p_anns_norm): + cui = ann[1] + if ann in anns_norm: + self.tp += 1 + self.tps[cui] = self.tps.get(cui, 0) + 1 + + example = p_anns_examples[iann] + self.examples['tp'][cui] = self.examples['tp'].get(cui, []) + [example] + else: + self.fp += 1 + self.fps[cui] = self.fps.get(cui, 0) + 1 + self.fp_docs.add(doc.get('name', 'unk')) + + # Add example for this FP prediction + example = p_anns_examples[iann] + if ann in anns_norm_neg: + # Means that it really was annotated as negative + example['real_fp'] = True + + self.examples['fp'][cui] = self.examples['fp'].get(cui, []) + [example] + + def _create_annoation(self, project_name: 
str, project_id: str, cui: str, doc: dict, ann: Dict) -> Dict: + return {"text": doc['text'][max(0, ann['start']-60):ann['end']+60], + "cui": cui, + "start": ann['start'], + "end": ann['end'], + "source value": ann['value'], + "acc": 1, + "project name": project_name, + "document name": doc.get('name'), + "project id": project_id, + "document id": doc.get('id')} + + def _create_annoation_2(self, project_name: str, project_id: str, cui: str, doc: dict, ann) -> Dict: + return {"text": doc['text'][max(0, ann.start_char-60):ann.end_char+60], + "cui": cui, + "start": ann.start_char, + "end": ann.end_char, + "source value": ann.text, + "acc": float(ann._.context_similarity), + "project name": project_name, + "document name": doc.get('name'), + "project id": project_id, + "document id": doc.get('id')} + + def _preprocess_annotations(self, project_name: str, project_id: str, + doc: dict, anns: List[Dict]) -> Tuple[list, list, list, list]: + anns_norm = [] + anns_norm_neg = [] + anns_examples = [] + anns_norm_cui = [] + for ann in anns: + cui = ann['cui'] + if self.filters.check_filters(cui): + if self.use_groups: + cui = self.cui2group.get(cui, cui) + + if ann.get('validated', True) and (not ann.get('killed', False) and not ann.get('deleted', False)): + anns_norm.append((ann['start'], cui)) + anns_examples.append(self._create_annoation(project_name, project_id, cui, doc, ann)) + elif ann.get('validated', True) and (ann.get('killed', False) or ann.get('deleted', False)): + anns_norm_neg.append((ann['start'], cui)) + + if ann.get("validated", True): + # This is used to test was someone annotating for this CUI in this document + anns_norm_cui.append(cui) + self.cui_counts[cui] = self.cui_counts.get(cui, 0) + 1 + return anns_norm, anns_norm_neg, anns_examples, anns_norm_cui + + def finalise_report(self, epoch: int, do_print: bool = True): + try: + prec = self.tp / (self.tp + self.fp) + rec = self.tp / (self.tp + self.fn) + f1 = 2*(prec*rec) / (prec + rec) + if do_print: + print("Epoch: {}, Prec: {}, Rec: {}, F1: {}\n".format(epoch, prec, rec, f1)) + print("Docs with false positives: {}\n".format("; ".join([str(x) for x in list(self.fp_docs)[0:10]]))) + print("Docs with false negatives: {}\n".format("; ".join([str(x) for x in list(self.fn_docs)[0:10]]))) + + # Sort fns & prec + fps = {k: v for k, v in sorted(self.fps.items(), key=lambda item: item[1], reverse=True)} + fns = {k: v for k, v in sorted(self.fns.items(), key=lambda item: item[1], reverse=True)} + tps = {k: v for k, v in sorted(self.tps.items(), key=lambda item: item[1], reverse=True)} + + + # F1 per concept + for cui in tps.keys(): + prec = tps[cui] / (tps.get(cui, 0) + fps.get(cui, 0)) + rec = tps[cui] / (tps.get(cui, 0) + fns.get(cui, 0)) + f1 = 2*(prec*rec) / (prec + rec) + self.cui_prec[cui] = prec + self.cui_rec[cui] = rec + self.cui_f1[cui] = f1 + + + # Get top 10 + pr_fps = [(self.cui2preferred_name.get(cui, + list(self.cui2names.get(cui, [cui]))[0]), cui, fps[cui]) for cui in list(fps.keys())[0:10]] + pr_fns = [(self.cui2preferred_name.get(cui, + list(self.cui2names.get(cui, [cui]))[0]), cui, fns[cui]) for cui in list(fns.keys())[0:10]] + pr_tps = [(self.cui2preferred_name.get(cui, + list(self.cui2names.get(cui, [cui]))[0]), cui, tps[cui]) for cui in list(tps.keys())[0:10]] + + if do_print: + print("\n\nFalse Positives\n") + for one in pr_fps: + print("{:70} - {:20} - {:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) + print("\n\nFalse Negatives\n") + for one in pr_fns: + print("{:70} - {:20} - 
{:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) + print("\n\nTrue Positives\n") + for one in pr_tps: + print("{:70} - {:20} - {:10}".format(str(one[0])[0:69], str(one[1])[0:19], one[2])) + print("*"*110 + "\n") + + except Exception: + traceback.print_exc() + + def unwrap(self) -> Tuple: + return (self.fps, self.fns, self.tps, + self.cui_prec, self.cui_rec, self.cui_f1, + self.cui_counts, self.examples) + + @classmethod + def from_cat(cls, cat, + local_filters: LinkingFilters, + use_project_filters: bool = False, + use_overlaps: bool = False, + use_cui_doc_limit: bool = False, + use_groups: bool = False, + extra_cui_filter: Optional[Set] = None) -> 'StatsBuilder': + return StatsBuilder(filters=local_filters, + addl_info=cat.cdb.addl_info, + doc_getter=cat.__call__, + doc_annotation_getter=cat._get_doc_annotations, + cui2group=cat.cdb.addl_info['cui2group'], + cui2preferred_name=cat.cdb.cui2preferred_name, + cui2names=cat.cdb.cui2names, + use_project_filters=use_project_filters, + use_overlaps=use_overlaps, + use_cui_doc_limit=use_cui_doc_limit, + use_groups=use_groups, + extra_cui_filter=extra_cui_filter) + + +def get_stats(cat, + data: Dict, + epoch: int = 0, + use_project_filters: bool = False, + use_overlaps: bool = False, + use_cui_doc_limit: bool = False, + use_groups: bool = False, + extra_cui_filter: Optional[Set] = None, + do_print: bool = True) -> Tuple: + """TODO: Refactor and make nice + Print metrics on a dataset (F1, P, R), it will also print the concepts that have the most FP,FN,TP. + + Args: + cat: (CAT): + The model pack. + data (list of dict): + The json object that we get from MedCATtrainer on export. + epoch (int): + Used during training, so we know what epoch is it. + use_project_filters (boolean): + Each project in MedCATtrainer can have filters, do we want to respect those filters + when calculating metrics. + use_overlaps (boolean): + Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites. + use_cui_doc_limit (boolean): + If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words + if the document was annotated for that CUI. Useful in very specific situations when during the annotation + process the set of CUIs changed. + use_groups (boolean): + If True concepts that have groups will be combined and stats will be reported on groups. + extra_cui_filter(Optional[Set]): + This filter will be intersected with all other filters, or if all others are not set then only this one will be used. + + Returns: + fps (dict): + False positives for each CUI. + fns (dict): + False negatives for each CUI. + tps (dict): + True positives for each CUI. + cui_prec (dict): + Precision for each CUI. + cui_rec (dict): + Recall for each CUI. + cui_f1 (dict): + F1 for each CUI. + cui_counts (dict): + Number of occurrence for each CUI. + examples (dict): + Examples for each of the fp, fn, tp. Format will be examples['fp']['cui'][]. + do_print (bool): + Whether to print stats out. Defaults to True. 
+ """ + orig_filters = cat.config.linking.filters.copy_of() + local_filters = cat.config.linking.filters + builder = StatsBuilder.from_cat(cat, + local_filters=local_filters, + use_project_filters=use_project_filters, + use_overlaps=use_overlaps, + use_cui_doc_limit=use_cui_doc_limit, + use_groups=use_groups, + extra_cui_filter=extra_cui_filter) + for pind, project in tqdm(enumerate(data['projects']), desc="Stats project", total=len(data['projects']), leave=False): + builder.process_project(project) + + # this is the part that prints out the stats + builder.finalise_report(epoch, do_print=do_print) + + cat.config.linking.filters = orig_filters + + return builder.unwrap() diff --git a/medcat/utils/filters.py b/medcat/utils/filters.py index c4803027a..cb85e0e26 100644 --- a/medcat/utils/filters.py +++ b/medcat/utils/filters.py @@ -1,3 +1,9 @@ +from typing import Optional, Set, Dict + +from medcat.config import LinkingFilters +from medcat.utils.matutils import intersect_nonempty_set + + def check_filters(cui, filters): """Checks is a CUI in the filters @@ -15,7 +21,7 @@ def check_filters(cui, filters): return False -def get_all_irrelevant_cuis(project, cdb): +def get_all_irrelevant_cuis(project): i_cuis = set() for d in project['documents']: for a in d['annotations']: @@ -24,7 +30,7 @@ def get_all_irrelevant_cuis(project, cdb): return i_cuis -def get_project_filters(cuis, type_ids, cdb, project=None): +def get_project_filters(cuis, type_ids, addl_info: Dict, project=None): cui_filter = set() if isinstance(cuis, str): if cuis is not None and cuis: @@ -33,10 +39,10 @@ def get_project_filters(cuis, type_ids, cdb, project=None): type_ids = [x.strip().upper() for x in type_ids.split(",")] # Convert type_ids to cuis - if 'type_id2cuis' in cdb.addl_info: + if 'type_id2cuis' in addl_info: for type_id in type_ids: - if type_id in cdb.addl_info['type_id2cuis']: - cui_filter.update(cdb.addl_info['type_id2cuis'][type_id]) + if type_id in addl_info['type_id2cuis']: + cui_filter.update(addl_info['type_id2cuis'][type_id]) else: raise Exception("Impossible to create filters, disable them.") else: @@ -45,8 +51,33 @@ def get_project_filters(cuis, type_ids, cdb, project=None): cui_filter = set(cuis) if project is not None: - i_cuis = get_all_irrelevant_cuis(project, cdb) + i_cuis = get_all_irrelevant_cuis(project) for i_cui in i_cuis: cui_filter.remove(i_cui) return cui_filter + + +def set_project_filters(addl_info: Dict, local_filters: LinkingFilters, project: dict, + extra_cui_filter: Optional[Set], use_project_filters: bool): + """Set the project filters to a LinkingFilters object based on + the specified project. 
+ + Args: + addl_info (Dict): The CDB additional information + local_filters (LinkingFilters): The linking filters instance + project (dict): The project + extra_cui_filter (Optional[Set]): Extra CUIs (if specified) + use_project_filters (bool): Whether to use per-project filters + """ + if isinstance(extra_cui_filter, set): + local_filters.cuis = extra_cui_filter + + if use_project_filters: + project_filter = get_project_filters(cuis=project.get('cuis', None), + type_ids=project.get('tuis', None), + addl_info=addl_info, + project=project) + # Intersect project filter with existing if it has something + if project_filter: + local_filters.cuis = intersect_nonempty_set(project_filter, local_filters.cuis) diff --git a/medcat/utils/regression/targeting.py b/medcat/utils/regression/targeting.py index 19f19bb3f..7a13b2bcc 100644 --- a/medcat/utils/regression/targeting.py +++ b/medcat/utils/regression/targeting.py @@ -25,12 +25,12 @@ class TranslationLayer: Args: cui2names (Dict[str, Set[str]]): The map from CUI to names - name2cuis (Dict[str, Set[str]]): The map from name to CUIs + name2cuis (Dict[str, List[str]]): The map from name to CUIs cui2type_ids (Dict[str, Set[str]]): The map from CUI to type_ids cui2children (Dict[str, Set[str]]): The map from CUI to child CUIs """ - def __init__(self, cui2names: Dict[str, Set[str]], name2cuis: Dict[str, Set[str]], + def __init__(self, cui2names: Dict[str, Set[str]], name2cuis: Dict[str, List[str]], cui2type_ids: Dict[str, Set[str]], cui2children: Dict[str, Set[str]]) -> None: self.cui2names = cui2names self.name2cuis = name2cuis diff --git a/setup.py b/setup.py index ab49eaff1..34963943a 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ url="https://github.com/CogStack/MedCAT", packages=['medcat', 'medcat.utils', 'medcat.preprocessing', 'medcat.ner', 'medcat.linking', 'medcat.datasets', 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', - 'medcat.utils.saving', 'medcat.utils.regression'], + 'medcat.utils.saving', 'medcat.utils.regression', 'medcat.stats'], install_requires=[ 'numpy>=1.22.0', # first to support 3.11 'pandas>=1.4.2', # first to support 3.11 diff --git a/tests/test_cat.py b/tests/test_cat.py index acd337e71..368b1e885 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -17,6 +17,7 @@ class CATTests(unittest.TestCase): + SUPERVISED_TRAINING_JSON = os.path.join(os.path.dirname(__file__), "resources", "medcat_trainer_export.json") @classmethod def setUpClass(cls) -> None: @@ -39,7 +40,8 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: cls.undertest.destroy_pipe() - shutil.rmtree(cls.meta_cat_dir) + if os.path.exists(cls.meta_cat_dir): + shutil.rmtree(cls.meta_cat_dir) def tearDown(self) -> None: self.cdb.config.annotation_output.include_text_in_output = False @@ -214,7 +216,7 @@ def test_get_entities_multi_texts_including_text(self): def test_train_supervised(self): nepochs = 3 num_of_documents = 27 - data_path = os.path.join(os.path.dirname(__file__), "resources", "medcat_trainer_export.json") + data_path = self.SUPERVISED_TRAINING_JSON ckpt_dir_path = tempfile.TemporaryDirectory().name checkpoint = Checkpoint(dir_path=ckpt_dir_path, steps=1, max_to_keep=sys.maxsize) fp, fn, tp, p, r, f1, cui_counts, examples = self.undertest.train_supervised(data_path, @@ -391,6 +393,42 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def 
test_print_stats(self): + # based on current JSON + EXP_FALSE_NEGATIVES = {'C0017168': 2, 'C0020538': 43, 'C0038454': 4, 'C0007787': 1, 'C0155626': 4, 'C0011860': 12, + 'C0042029': 6, 'C0010068': 2, 'C0007222': 1, 'C0027051': 6, 'C0878544': 1, 'C0020473': 12, + 'C0037284': 21, 'C0003864': 4, 'C0011849': 12, 'C0005686': 1, 'C0085762': 3, 'C0030920': 2, + 'C0854135': 3, 'C0004096': 4, 'C0010054': 10, 'C0497156': 10, 'C0011334': 2, 'C0018939': 1, + 'C1561826': 2, 'C0276289': 2, 'C0041834': 9, 'C0000833': 2, 'C0238792': 1, 'C0040034': 3, + 'C0035078': 5, 'C0018799': 5, 'C0042109': 1, 'C0035439': 1, 'C0035435': 1, 'C0018099': 1, + 'C1277187': 1, 'C0024117': 7, 'C0004238': 4, 'C0032227': 6, 'C0008679': 1, 'C0013146': 6, + 'C0032285': 1, 'C0002871': 7, 'C0149871': 4, 'C0442886': 1, 'C0022104': 1, 'C0034065': 5, + 'C0011854': 6, 'C1398668': 1, 'C0020676': 2, 'C1301700': 1, 'C0021167': 1, 'C0029456': 2, + 'C0011570': 10, 'C0009324': 1, 'C0011882': 1, 'C0020615': 1, 'C0242510': 2, 'C0033581': 2, + 'C0011168': 3, 'C0039082': 2, 'C0009241': 2, 'C1404970': 1, 'C0018524': 3, 'C0150063': 1, + 'C0917799': 1, 'C0178417': 1, 'C0033975': 1, 'C0011253': 1, 'C0018802': 8, 'C0022661': 4, + 'C0017658': 1, 'C0023895': 2, 'C0003123': 1, 'C0041582': 4, 'C0085096': 4, 'C0403447': 2, + 'C2363741': 2, 'C0457949': 1, 'C0040336': 1, 'C0037315': 2, 'C0024236': 3, 'C0442874': 1, + 'C0028754': 4, 'C0520679': 5, 'C0028756': 2, 'C0029408': 5, 'C0409959': 2, 'C0018801': 1, + 'C3844825': 1, 'C0022660': 2, 'C0005779': 4, 'C0011175': 1, 'C0018965': 4, 'C0018889': 1, + 'C0022354': 2, 'C0033377': 1, 'C0042769': 1, 'C0035222': 1, 'C1456868': 2, 'C1145670': 1, + 'C0018790': 1, 'C0263746': 1, 'C0206172': 1, 'C0021400': 1, 'C0243026': 1, 'C0020443': 1, + 'C0001883': 1, 'C0031350': 1, 'C0010709': 4, 'C1565489': 7, 'C3489393': 1, 'C0005586': 2, + 'C0158288': 5, 'C0700594': 4, 'C0158266': 3, 'C0006444': 2, 'C0024003': 1} + with open(self.SUPERVISED_TRAINING_JSON) as f: + data = json.load(f) + (fps, fns, tps, + cui_prec, cui_rec, cui_f1, + cui_counts, examples) = self.undertest._print_stats(data) + self.assertEqual(fps, {}) + self.assertEqual(fns, EXP_FALSE_NEGATIVES) + self.assertEqual(tps, {}) + self.assertEqual(cui_prec, {}) + self.assertEqual(cui_rec, {}) + self.assertEqual(cui_f1, {}) + self.assertEqual(len(cui_counts), 136) + self.assertEqual(len(examples), 3) + def _assertNoLogs(self, logger: logging.Logger, level: int): if hasattr(self, 'assertNoLogs'): return self.assertNoLogs(logger=logger, level=level) From 90bf65e5fcde5daa3ad3bd6fefcbdaa56a7eea5f Mon Sep 17 00:00:00 2001 From: jenniferajiang Date: Mon, 18 Dec 2023 15:00:19 +0000 Subject: [PATCH 47/64] Load stopwords in Defaults before spacy model --- medcat/pipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3861267df..e1e99af2e 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -38,9 +38,9 @@ class Pipe(object): """ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: - self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) if config.preprocessing.stopwords is not None: - self._nlp.Defaults.stop_words = set(config.preprocessing.stopwords) + Language.Defaults.stop_words = set(config.preprocessing.stopwords) + self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) self._nlp.tokenizer = tokenizer(self._nlp, config) # Set max document length self._nlp.max_length = config.preprocessing.max_document_length From 
9e5fca15148de22ba83d53079f7ed86910bfdcc3 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 18 Dec 2023 17:03:51 +0200 Subject: [PATCH 48/64] CU-8693az82g Remove cdb tests side effects (#380) * 8693az82g: Add method to CDBMaker to reset the CDB * 8693az82g: Add test in CDB tests to ensure a new CDB is used for each test * 8693az82g: Reset CDB in CDB tests before each test to avoid side effects --- medcat/cdb_maker.py | 14 ++++++++++++++ tests/test_cdb.py | 10 ++++++++++ 2 files changed, 24 insertions(+) diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index ca98f821e..a4dd7dd27 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -49,6 +49,14 @@ def __init__(self, config: Config, cdb: Optional[CDB] = None) -> None: name='skip_and_punct', additional_fields=['is_punct']) + def reset_cdb(self) -> None: + """This will re-create a new internal CDB based on the same config. + + This will be necessary if/when you're wishing to call `prepare_csvs` + multiple times on the same object `CDBMaker` instance. + """ + self.cdb = CDB(config=self.config) + def prepare_csvs(self, csv_paths: Union[pd.DataFrame, List[str]], sep: str = ',', @@ -59,6 +67,12 @@ def prepare_csvs(self, only_existing_cuis: bool = False, **kwargs) -> CDB: r"""Compile one or multiple CSVs into a CDB. + Note: This class/method generally uses the same instance of the CDB. + So if you're using the same CDBMaker and calling `prepare_csvs` + multiple times, you are likely to get leakage from prior calls + into new ones. + To reset the CDB, call `reset_cdb`. + Args: csv_paths (Union[pd.DataFrame, List[str]]): An array of paths to the csv files that should be processed. Can also be an array of pd.DataFrames diff --git a/tests/test_cdb.py b/tests/test_cdb.py index f7be24d64..1be74edfe 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -22,11 +22,21 @@ def setUp(self) -> None: cdb_2_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb_2.csv") self.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") os.makedirs(self.tmp_dir, exist_ok=True) + # resetting the CDB because otherwise the CDBMaker + # will refer to and modify the same instance of the CDB + # and this can (and does!) 
create side effects + CDBTests.cdb_maker.reset_cdb() self.undertest = CDBTests.cdb_maker.prepare_csvs([cdb_csv, cdb_2_csv], full_build=True) def tearDown(self) -> None: shutil.rmtree(self.tmp_dir) + def test_setup_changes_cdb(self): + id1 = id(self.undertest) + self.setUp() + id2 = id(self.undertest) + self.assertNotEqual(id1, id2) + def test_name2cuis(self): self.assertEqual({ 'second~csv': ['C0000239'], From 72ac8d7526f1d5750588289bde22d124dc850964 Mon Sep 17 00:00:00 2001 From: jenniferajiang Date: Mon, 18 Dec 2023 18:03:22 +0000 Subject: [PATCH 49/64] Added tests --- medcat/pipe.py | 3 ++- tests/test_pipe.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index e1e99af2e..3ed042219 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -39,7 +39,8 @@ class Pipe(object): def __init__(self, tokenizer: Tokenizer, config: Config) -> None: if config.preprocessing.stopwords is not None: - Language.Defaults.stop_words = set(config.preprocessing.stopwords) + cls = spacy.util.get_lang_class('en') + cls.Defaults.stop_words = set(config.preprocessing.stopwords) self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) self._nlp.tokenizer = tokenizer(self._nlp, config) # Set max document length diff --git a/tests/test_pipe.py b/tests/test_pipe.py index e6da42898..17a6bbe59 100644 --- a/tests/test_pipe.py +++ b/tests/test_pipe.py @@ -28,6 +28,7 @@ def setUpClass(cls) -> None: cls.config.ner['max_skip_tokens'] = 1 cls.config.ner['upper_case_limit_len'] = 4 cls.config.linking['disamb_length_limit'] = 2 + cls.config.preprocessing.stopwords = {'stop', 'words'} cls.cdb = CDB(config=cls.config) downloader = VocabDownloader() @@ -42,7 +43,7 @@ def setUpClass(cls) -> None: _tokenizer = TokenizerWrapperBERT(hf_tokenizers=AutoTokenizer.from_pretrained("bert-base-uncased")) cls.meta_cat = MetaCAT(tokenizer=_tokenizer) - cls.text = "CDB - I was running and then Movar Virus attacked and CDb" + cls.text = "stop of CDB - I was running and then Movar Virus attacked and CDb" cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config) @classmethod @@ -81,6 +82,11 @@ def test_add_meta_cat(self): PipeTests.undertest.add_meta_cat(PipeTests.meta_cat) self.assertEqual(PipeTests.meta_cat.name, Language.get_factory_meta(PipeTests.meta_cat.name).factory) + + def test_stopwords_loading(self): + self.assertEqual(PipeTests.undertest._nlp.Defaults.stop_words, PipeTests.config.preprocessing.stopwords) + self.assertEqual(PipeTests.undertest._nlp(PipeTests.text)[0].is_stop, True) + self.assertEqual(PipeTests.undertest._nlp(PipeTests.text)[1].is_stop, False) def test_batch_multi_process(self): PipeTests.undertest.add_tagger(tagger=tag_skip_and_punct, additional_fields=["is_punct"]) From 22e22550ba0da762351212a21bb5203a58016861 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 21 Dec 2023 19:27:41 +0200 Subject: [PATCH 50/64] CU-8693bpq82 fallback spacy model (#384) * CU-8693bpq82: Add fallback spacy model along with test * CU-8693bpq82: Remove debug output * CU-8693bpq82: Add exception info to warning upon spacy model load failure and fallback --- medcat/cdb.py | 2 +- medcat/pipe.py | 23 ++++++++++++++++++++++- tests/test_cat.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 2ca8382a7..76cb7327e 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -526,7 +526,7 @@ def load_config(self, config_path: str) -> None: if not 
os.path.exists(config_path): if not self._config_from_file: # if there's no config defined anywhere - raise ValueError("Could not find a config in the CDB nor ", + raise ValueError("Could not find a config in the CDB nor " "in the config.json for this model " f"({os.path.dirname(config_path)})", ) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3861267df..1ad9e6766 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) # different logger from the package-level one +DEFAULT_SPACY_MODEL = 'en_core_web_md' + + class Pipe(object): """A wrapper around the standard spacy pipeline. @@ -38,7 +41,22 @@ class Pipe(object): """ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: - self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) + try: + self._nlp = self._init_nlp(config) + except Exception as e: + if config.general.spacy_model == DEFAULT_SPACY_MODEL: + raise e + logger.warning("Could not load spacy model from '%s'. " + "Falling back to installed en_core_web_md. " + "For best compatibility, we'd recommend " + "packaging and using your model pack with " + "the spacy model it was designed for", + config.general.spacy_model, exc_info=e) + # we're changing the config value so that this propages + # to other places that try to load the model. E.g: + # medcat.utils.normalizers.TokenNormalizer.__init__ + config.general.spacy_model = DEFAULT_SPACY_MODEL + self._nlp = self._init_nlp(config) if config.preprocessing.stopwords is not None: self._nlp.Defaults.stop_words = set(config.preprocessing.stopwords) self._nlp.tokenizer = tokenizer(self._nlp, config) @@ -48,6 +66,9 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: # Set log level logger.setLevel(self.config.general.log_level) + def _init_nlp(selef, config: Config) -> Language: + return spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) + def add_tagger(self, tagger: Callable, name: Optional[str] = None, additional_fields: List[str] = []) -> None: """Add any kind of a tagger for tokens. 
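
The fallback added to `Pipe.__init__` above boils down to: try the spacy model named in the config, and if loading fails for anything other than the stock model, warn, rewrite `config.general.spacy_model` and retry with the default. A rough standalone sketch of that pattern (the function name is made up for illustration; the real logic lives in `Pipe.__init__`/`_init_nlp`):

    import logging
    import spacy

    logger = logging.getLogger(__name__)
    DEFAULT_SPACY_MODEL = 'en_core_web_md'

    def load_spacy_with_fallback(spacy_model: str, disabled_components=()):
        """Try the configured model first; otherwise fall back to the stock model."""
        try:
            return spacy.load(spacy_model, disable=list(disabled_components))
        except Exception as e:
            if spacy_model == DEFAULT_SPACY_MODEL:
                raise  # nothing left to fall back to
            logger.warning("Could not load spacy model from '%s'; falling back to %s",
                           spacy_model, DEFAULT_SPACY_MODEL, exc_info=e)
            return spacy.load(DEFAULT_SPACY_MODEL, disable=list(disabled_components))
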
diff --git a/tests/test_cat.py b/tests/test_cat.py index 368b1e885..7012e24c7 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -10,6 +10,7 @@ from medcat.vocab import Vocab from medcat.cdb import CDB, logger as cdb_logger from medcat.cat import CAT, logger as cat_logger +from medcat.pipe import logger as pipe_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -499,6 +500,34 @@ def test_loading_model_pack_with_cdb_config_and_config_json_raises_exception(sel CAT.load_model_pack(self.model_path) +class ModelLoadsUnreadableSpacy(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.temp_dir = tempfile.TemporaryDirectory() + model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples") + cdb = CDB.load(os.path.join(model_path, 'cdb.dat')) + cdb.config.general.spacy_model = os.path.join(cls.temp_dir.name, "en_core_web_md") + # save CDB in new location + cdb.save(os.path.join(cls.temp_dir.name, 'cdb.dat')) + # save config in new location + cdb.config.save(os.path.join(cls.temp_dir.name, 'config.json')) + # copy vocab into new location + vocab_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat") + cls.vocab_path = os.path.join(cls.temp_dir.name, 'vocab.dat') + shutil.copyfile(vocab_path, cls.vocab_path) + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE temp dir + cls.temp_dir.cleanup() + + def test_loads_without_specified_spacy_model(self): + with self.assertLogs(logger=pipe_logger, level=logging.WARNING): + cat = CAT.load_model_pack(self.temp_dir.name) + self.assertTrue(isinstance(cat, CAT)) + + class ModelWithZeroConfigsLoadTests(unittest.TestCase): @classmethod From 37a9d92b4310017b54dd17e5b081dcd2a875e6c2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 22 Dec 2023 11:47:30 +0000 Subject: [PATCH 51/64] Remove tests of internals where possible --- tests/test_pipe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_pipe.py b/tests/test_pipe.py index 17a6bbe59..8ce47cfb5 100644 --- a/tests/test_pipe.py +++ b/tests/test_pipe.py @@ -85,8 +85,9 @@ def test_add_meta_cat(self): def test_stopwords_loading(self): self.assertEqual(PipeTests.undertest._nlp.Defaults.stop_words, PipeTests.config.preprocessing.stopwords) - self.assertEqual(PipeTests.undertest._nlp(PipeTests.text)[0].is_stop, True) - self.assertEqual(PipeTests.undertest._nlp(PipeTests.text)[1].is_stop, False) + doc = PipeTests.undertest(PipeTests.text) + self.assertEqual(doc[0].is_stop, True) + self.assertEqual(doc[1].is_stop, False) def test_batch_multi_process(self): PipeTests.undertest.add_tagger(tagger=tag_skip_and_punct, additional_fields=["is_punct"]) From 392f80b71d2737bdb8be92a8335820e92ef0101d Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 22 Dec 2023 11:50:45 +0000 Subject: [PATCH 52/64] Add test for skipping of stopwords --- tests/test_cat.py | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/test_cat.py b/tests/test_cat.py index acd337e71..28ede9ca5 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -39,7 +39,8 @@ def setUpClass(cls) -> None: @classmethod def tearDownClass(cls) -> None: cls.undertest.destroy_pipe() - shutil.rmtree(cls.meta_cat_dir) + if os.path.exists(cls.meta_cat_dir): + shutil.rmtree(cls.meta_cat_dir) def tearDown(self) -> None: self.cdb.config.annotation_output.include_text_in_output = False @@ -440,6 
+441,46 @@ def test_add_and_train_concept_cdb_warns_short_name(self): self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) +class GetEntitiesWithStopWords(unittest.TestCase): + # NB! The order in which the different CDBs are created + # is important here since the way that the stop words are + # set is class-based, it creates the side effect of having + # the same stop words the next time around + # regardless of whether or not they should've been set + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) + cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + cls.vocab.make_unigram_table() + cls.cdb.config.general.spacy_model = "en_core_web_md" + cls.cdb.config.ner.min_name_len = 2 + cls.cdb.config.ner.upper_case_limit_len = 3 + cls.cdb.config.general.spell_check = True + cls.cdb.config.linking.train_count_threshold = 10 + cls.cdb.config.linking.similarity_threshold = 0.3 + cls.cdb.config.linking.train = True + cls.cdb.config.linking.disamb_length_limit = 5 + cls.cdb.config.general.full_unlink = True + # the regular CAT without stopwords + cls.no_stopwords = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab, meta_cats=[]) + # this (the following two lines) + # needs to be done before initialising the CAT + # since that initialises the pipe + cls.cdb.config.preprocessing.stopwords = {"stop", "words"} + cls.cdb.config.preprocessing.skip_stopwords = True + # the CAT that skips the stopwords + cls.w_stopwords = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab, meta_cats=[]) + + def test_stopwords_are_skipped(self, text: str = "second words csv"): + # without stopwords no entities are captured + # with stopwords, the `second words csv` entity is captured + doc_no_stopwords = self.no_stopwords(text) + self.cdb.config.preprocessing.skip_stopwords = True + doc_w_stopwords = self.w_stopwords(text) + self.assertGreater(len(doc_no_stopwords), len(doc_w_stopwords)) + + class ModelWithTwoConfigsLoadTests(unittest.TestCase): @classmethod From 276bcf1b1b777c52543b9b91f87d59702a501dc2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 22 Dec 2023 11:52:07 +0000 Subject: [PATCH 53/64] Avoid supporting only English for stopwords --- medcat/pipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3ed042219..906b73b17 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -39,7 +39,8 @@ class Pipe(object): def __init__(self, tokenizer: Tokenizer, config: Config) -> None: if config.preprocessing.stopwords is not None: - cls = spacy.util.get_lang_class('en') + lang = config.general.spacy_model.split('_', 1)[0] + cls = spacy.util.get_lang_class(lang) cls.Defaults.stop_words = set(config.preprocessing.stopwords) self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) self._nlp.tokenizer = tokenizer(self._nlp, config) From 69c23934b794ac0a9f7866387ce02546dcf102f7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 22 Dec 2023 12:49:33 +0000 Subject: [PATCH 54/64] Remove debug output --- medcat/pipe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3698a716c..485d1d5d4 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -43,7 +43,6 @@ class Pipe(object): def __init__(self, tokenizer: Tokenizer, config: Config) -> None: if 
config.preprocessing.stopwords is not None: lang = config.general.spacy_model.split('_', 1)[0] - print("(stopwords) LANG", lang) cls = spacy.util.get_lang_class(lang) cls.Defaults.stop_words = set(config.preprocessing.stopwords) try: @@ -262,7 +261,6 @@ def _ensure_serializable(doc: Doc) -> Doc: def __call__(self, text: Union[str, Iterable[str]]) -> Union[Doc, List[Doc]]: if isinstance(text, str): - print("Pipe.__call__ w", id(self._nlp)) return self._nlp(text) if len(text) > 0 else None # type: ignore elif isinstance(text, Iterable): docs = [] From 80b438755555aa700f9ba1d07c5138f21aa26f7e Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 22 Dec 2023 13:03:43 +0000 Subject: [PATCH 55/64] Make sure stopwords language getter works for file-path spacy models --- medcat/pipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 485d1d5d4..c29267f38 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -1,4 +1,5 @@ import types +import os import spacy import gc import logging @@ -42,7 +43,7 @@ class Pipe(object): def __init__(self, tokenizer: Tokenizer, config: Config) -> None: if config.preprocessing.stopwords is not None: - lang = config.general.spacy_model.split('_', 1)[0] + lang = os.path.basename(config.general.spacy_model).split('_', 1)[0] cls = spacy.util.get_lang_class(lang) cls.Defaults.stop_words = set(config.preprocessing.stopwords) try: From abfb1e7ca91b87f4f840e335ab047ddd387b5885 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 8 Jan 2024 14:35:51 +0200 Subject: [PATCH 56/64] CU-8693cv3w0 Fix fallback spacy model existance on pip installs (#386) * CU-8693cv3w0: Add method to ensure spacy model and use it when falling back to default model * CU-8693cv3w0: Add logged output when installing/downloading spacy model --- medcat/pipe.py | 2 ++ medcat/utils/helpers.py | 32 ++++++++++++++++++++++++++++++++ tests/utils/test_helpers.py | 24 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 tests/utils/test_helpers.py diff --git a/medcat/pipe.py b/medcat/pipe.py index c29267f38..7bf06364b 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -18,6 +18,7 @@ from medcat.pipeline.pipe_runner import PipeRunner from medcat.preprocessing.taggers import tag_skip_and_punct from medcat.ner.transformers_ner import TransformersNER +from medcat.utils.helpers import ensure_spacy_model logger = logging.getLogger(__name__) # different logger from the package-level one @@ -60,6 +61,7 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: # we're changing the config value so that this propages # to other places that try to load the model. E.g: # medcat.utils.normalizers.TokenNormalizer.__init__ + ensure_spacy_model(DEFAULT_SPACY_MODEL) config.general.spacy_model = DEFAULT_SPACY_MODEL self._nlp = self._init_nlp(config) self._nlp.tokenizer = tokenizer(self._nlp, config) diff --git a/medcat/utils/helpers.py b/medcat/utils/helpers.py index f783a9b06..816b316ce 100644 --- a/medcat/utils/helpers.py +++ b/medcat/utils/helpers.py @@ -537,3 +537,35 @@ def has_new_spacy() -> bool: return (major > 3 or (major == 3 and minor > 3) or (major == 3 and minor == 3 and patch >= 1)) + + +def has_spacy_model(model_name: str) -> bool: + """Checks if the spacy model is available. + + Args: + model_name (str): The model name. + + Returns: + bool: True if the model is available, False otherwise. 
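+
+    A minimal doctest-style sketch (this assumes `en_core_web_md` is
+    installed, as the dev requirements do; the second name is deliberately
+    bogus):
+
+        >>> has_spacy_model('en_core_web_md')
+        True
+        >>> has_spacy_model('no_such_model')
+        False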
+ """ + import spacy.util + return model_name in spacy.util.get_installed_models() + + +def ensure_spacy_model(model_name: str) -> None: + """Ensure the specified spacy model exists. + + If the model does not currently exist, it will attempt downloading it. + + Args: + model_name (str): The spacy model name. + """ + import subprocess + if has_spacy_model(model_name): + return + # running in subprocess so that we can catch the exception + # if the model name is unknown. Otherwise we'd just be bumped + # out of python (sys.exit). + logger.info("Installing the spacy model %s using the CLI command " + "'python -m spacy download %s'", model_name, model_name) + subprocess.run(["python", "-m", "spacy", "download", model_name], check=True) diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py new file mode 100644 index 000000000..6703ce91a --- /dev/null +++ b/tests/utils/test_helpers.py @@ -0,0 +1,24 @@ +from medcat.utils.helpers import has_spacy_model, ensure_spacy_model +from medcat.pipe import DEFAULT_SPACY_MODEL + +import unittest +import subprocess + + +class HasSpacyModelTests(unittest.TestCase): + + def test_no_rubbish_model(self, model_name='rubbish_model'): + self.assertFalse(has_spacy_model(model_name)) + + def test_has_def_model(self, model_name=DEFAULT_SPACY_MODEL): + self.assertTrue(has_spacy_model(model_name)) + + +class EnsureSpacyModelTests(unittest.TestCase): + + def test_fails_rubbish_model(self, model_name='rubbish_model'): + with self.assertRaises(subprocess.CalledProcessError): + ensure_spacy_model(model_name) + + def test_success_def_model(self, model_name=DEFAULT_SPACY_MODEL): + ensure_spacy_model(model_name) From d9a1facaa866a327666435c608b261daf3eb8fdf Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 8 Jan 2024 15:43:07 +0200 Subject: [PATCH 57/64] CU-8693b0a61 Add method to get spacy model version (#381) * CU-8693b0a61: Add method to find spacy folder in model pack along with some tests * CU-8693b0a61: Add test for spacy folder finding (full path) * CU-8693b0a61: Add method for finding spacy model in model pack along with tests * CU-8693b0a61: Add method for finding current spacy version * CU-8693b0a61: Add method for getting spacy model version installed * CU-8693b0a61: Fix getting spacy model folder return path * CU-8693b0a61: Add method to get name and meta of spacy model based on model pack * CU-8693b0a61: Add missing fake spacy model meta * CU-8693b0a61: Add missing docstrings * CU-8693b0a61: Change name of method for clarity * CU-8693b0a61: Add method to get spacy model name and version from model pack path * CU-8693b0a61: Fix a few typing issues * CU-8693b0a61: Add a missing docstring * CU-8693b0a61: Match folder name of fake spacy model to its name * CU-8693b0a61: Make the final method return true name of spacy model instead of folder name * Add additional output to method for getting spacy model version - the compatible spacy versions * CU-8693b0a61: Add method for querying whether the spacy version is compatible with a range * CU-8693b0a61: Add better abstraction for spacy version mocking in tests * CU-8693b0a61: Add some more abstraction for fake model pack in tests * CU-8693b0a61: Add method for checking whethera model pack has a spacy model compatible with installed spacy version * CU-8693b0a61: Improve abstraction within tests * CU-8693b0a61: Add method to check which of two versions is older * CU-8693b0a61: Fix fake spacy model versioning * CU-8693b0a61: Add method for determining whether a model pack has semi-compatible spacy model * 
CU-8693b0a61: Add missing word in docstring. * CU-8693b0a61: Change some method to protected ones --- medcat/utils/spacy_compatibility.py | 211 +++++++++++++++ tests/resources/ff_core_fake_dr/meta.json | 8 + tests/utils/test_spacy_compatibility.py | 302 ++++++++++++++++++++++ 3 files changed, 521 insertions(+) create mode 100644 medcat/utils/spacy_compatibility.py create mode 100644 tests/resources/ff_core_fake_dr/meta.json create mode 100644 tests/utils/test_spacy_compatibility.py diff --git a/medcat/utils/spacy_compatibility.py b/medcat/utils/spacy_compatibility.py new file mode 100644 index 000000000..a64737f21 --- /dev/null +++ b/medcat/utils/spacy_compatibility.py @@ -0,0 +1,211 @@ +"""This module attempts to read the spacy compatibilty of +a model pack and (if necessary) compare it to the installed +spacy version. +""" +from typing import Tuple, List, cast +import os +import re +from packaging import version +from packaging.specifiers import SpecifierSet + +import spacy + + +SPACY_MODEL_REGEX = re.compile(r"(\w{2}_core_(\w{3,4})_(sm|md|lg|trf|xxl|\w+))|(spacy_model)") + + +def _is_spacy_model_folder(folder_name: str) -> bool: + """Check if a folder within a model pack contains a spacy model. + + The idea is to do this without loading the model. That is because + the version of the model may be incompatible with what we've got. + And as such, loading may not be possible. + + Args: + folder_name (str): The folder to check. + + Returns: + bool: Whether the folder contains a spacy model. + """ + # since we're trying to identify this solely from the + # folder name, we only care about the base name. + folder_name = os.path.basename(folder_name) + if folder_name.startswith("meta_"): + # these are MetaCat stuff (or should be) + return False + return bool(SPACY_MODEL_REGEX.match(folder_name)) + + +def _find_spacy_model_folder(model_pack_folder: str) -> str: + """Find the spacy model folder in a model pack folder. + + Args: + model_pack_folder (str): The model pack folder + + Raises: + ValueError: If it's ambiguous or there's no model folder. + + Returns: + str: The full path to the model folder. + """ + options: List[str] = [] + for folder_name in os.listdir(model_pack_folder): + full_folder_path = os.path.join(model_pack_folder, folder_name) + if not os.path.isdir(full_folder_path): + continue + if _is_spacy_model_folder(folder_name): + options.append(full_folder_path) + if len(options) != 1: + raise ValueError("Unable to determine spacy folder name from " + f"{len(options)} ambiguous folders: {options}") + return options[0] + + +def get_installed_spacy_version() -> str: + """Get the spacy version installed currently. + + Returns: + str: The currently installed spacy verison. + """ + return spacy.__version__ + + +def get_installed_model_version(model_name: str) -> str: + """Get the version of a model installed in spacy. + + Args: + model_name (str): The model name. + + Returns: + str: The version of the installed model. + """ + if model_name not in spacy.util.get_installed_models(): + return 'N/A' + # NOTE: I don't really know when spacy.info + # might return a str instead + return cast(dict, spacy.info(model_name))['version'] + + +def _get_name_and_meta_of_spacy_model_in_medcat_modelpack(model_pack_path: str) -> Tuple[str, dict]: + """Gets the name and meta information about a spacy model within a medcat model pack. + + PS: This gets the raw (folder) name of the spacy model. 
+ While this is usually (in models created after v1.2.4) + identical to the spacy model version, that may not always + be the case. + + Args: + model_pack_path (str): The model pack path. + + Returns: + Tuple[str, dict]: The name of the spacy model, and the meta information. + """ + spacy_model_folder = _find_spacy_model_folder(model_pack_path) + # NOTE: I don't really know when spacy.info + # might return a str instead + info = cast(dict, spacy.info(spacy_model_folder)) + return os.path.basename(spacy_model_folder), info + + +def get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path: str) -> Tuple[str, str, str]: + """Get the name, version, and compatible spacy versions of a spacy model within a medcat model pack. + + PS: This gets the real name of the spacy model. + While this is usually (in models created after v1.2.4) + identical to the folder name, that may not always + be the case. + + Args: + model_pack_path (str): The model pack path. + + Returns: + Tuple[str, str, str]: The name of the spacy model, its version, and supported spacy version. + """ + _, info = _get_name_and_meta_of_spacy_model_in_medcat_modelpack(model_pack_path) + true_name = info["lang"] + "_" + info['name'] + return true_name, info['version'], info["spacy_version"] + + +def _is_spacy_version_within_range(spacy_version_range: str) -> bool: + """Checks whether the spacy version is within the specified range. + + The expected format of the version range is similar to that used + in requirements and/or pip installs. E.g: + - >=3.1.0,<3.2.0 + - ==3.1.0 + - >=3.1.0 + - <3.20 + + Args: + spacy_version_range (str): The requires spacy version range. + + Returns: + bool: Whether the specified range is compatible. + """ + spacy_version = version.parse(get_installed_spacy_version()) + range = SpecifierSet(spacy_version_range) + return range.contains(spacy_version) + + +def medcat_model_pack_has_compatible_spacy_model(model_pack_path: str) -> bool: + """Checks whether a medcat model pack has a spacy model compatible with installed spacy version. + + Args: + model_pack_path (str): The model pack path. + + Returns: + bool: Whether the spacy model in the model pack is compatible. + """ + _, _, spacy_range = get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path) + return _is_spacy_version_within_range(spacy_range) + + +def is_older_spacy_version(model_version: str) -> bool: + """Checks if the specified version is older than the installed version. + + Args: + model_version (str): The specified spacy version. + + Returns: + bool: Whether the specified version is older. + """ + installed_version = version.parse(get_installed_spacy_version()) + model_version = version.parse(model_version) + return model_version <= installed_version + + +def medcat_model_pack_has_semi_compatible_spacy_model(model_pack_path: str) -> bool: + """Checks whether the spacy model within a medcat model pack is + compatible or older than the installed spacy version. + + This method returns `True` if the spacy model is compatible or + released with a lower version number compared to the spacy + version currently installed. + + We've found that most of the time older models will work with + a newer version of spacy. Though there is a warning on spacy's + side and they do not guarantee 100% compatibility, we've not + seen issues so far. 
+ + E.g for installed spacy 3.4.4 all the following will be suiable: + - en_core_web_md-3.1.0 + - en_core_web_md-3.2.0 + - en_core_web_md-3.3.0 + - en_core_web_md-3.4.1 + However, for the same version, the following would not be suitable: + - en_core_web_md-3.5.0 + - en_core_web_md-3.6.0 + - en_core_web_md-3.7.1 + + Args: + model_pack_path (str): The model pack path. + + Returns: + bool: Whether the spacy model in the model pack is compatible. + """ + (_, + model_version, + spacy_range) = get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path) + if _is_spacy_version_within_range(spacy_range): + return True + return is_older_spacy_version(model_version) diff --git a/tests/resources/ff_core_fake_dr/meta.json b/tests/resources/ff_core_fake_dr/meta.json new file mode 100644 index 000000000..fe9825db7 --- /dev/null +++ b/tests/resources/ff_core_fake_dr/meta.json @@ -0,0 +1,8 @@ +{ + "lang":"ff", + "name":"core_fake_dr", + "version":"3.1.0", + "description":"This is a FAKE model", + "author":"Fakio Martimus", + "spacy_version":">=3.1.0,<3.2.0" + } \ No newline at end of file diff --git a/tests/utils/test_spacy_compatibility.py b/tests/utils/test_spacy_compatibility.py new file mode 100644 index 000000000..5cf0dd03e --- /dev/null +++ b/tests/utils/test_spacy_compatibility.py @@ -0,0 +1,302 @@ +import medcat.utils.spacy_compatibility as module_under_test +from medcat.utils.spacy_compatibility import _is_spacy_model_folder, _find_spacy_model_folder +from medcat.utils.spacy_compatibility import get_installed_spacy_version, get_installed_model_version +from medcat.utils.spacy_compatibility import _get_name_and_meta_of_spacy_model_in_medcat_modelpack +from medcat.utils.spacy_compatibility import get_name_and_version_of_spacy_model_in_medcat_modelpack +from medcat.utils.spacy_compatibility import _is_spacy_version_within_range +from medcat.utils.spacy_compatibility import medcat_model_pack_has_compatible_spacy_model +from medcat.utils.spacy_compatibility import is_older_spacy_version +from medcat.utils.spacy_compatibility import medcat_model_pack_has_semi_compatible_spacy_model + +import unittest + +from typing import Callable +import random +import string +import tempfile +import os +from contextlib import contextmanager + + +FAKE_SPACY_MODEL_NAME = "ff_core_fake_dr" +FAKE_SPACY_MODEL_DIR = os.path.join("tests", "resources", FAKE_SPACY_MODEL_NAME) +FAKE_MODELPACK_MODEL_DIR = os.path.join(FAKE_SPACY_MODEL_DIR, '..') + + +class SpacyModelFolderIdentifierTests(unittest.TestCase): + expected_working_spacy_models = [ + "en_core_sci_sm", + "en_core_web_sm", + "en_core_web_md", + "en_core_web_lg", + "en_core_web_trf", + "nl_core_news_sm", + "nl_core_news_md", + "nl_core_news_lg", + ] + # the following were used in medcat models created prior + # to v1.2.4 + expected_working_legacy_names = [ + "spacy_model" + ] + + def test_works_expected_models(self): + for model_name in self.expected_working_spacy_models: + with self.subTest(model_name): + self.assertTrue(_is_spacy_model_folder(model_name)) + + def test_works_legacy_models(self): + for model_name in self.expected_working_legacy_names: + with self.subTest(model_name): + self.assertTrue(_is_spacy_model_folder(model_name)) + + def test_works_fill_path(self): + for model_name in self.expected_working_legacy_names: + full_folder_path = os.path.join("some", "folder", "structure", model_name) + with self.subTest(full_folder_path): + self.assertTrue(_is_spacy_model_folder(model_name)) + + def get_all_garbage(self) -> list: + """Generate garbage 
"spacy names". + + Returns: + List[str]: Some random strings that shouldn't be spacy models. + """ + my_examples = ["garbage_in_and_out", "meta_Presence", "something"] + true_randoms_N10 = [''.join(random.choices(string.ascii_uppercase + string.digits, k=10)) for _ in range(10)] + true_randoms_N20 = [''.join(random.choices(string.ascii_uppercase + string.digits, k=20)) for _ in range(10)] + return my_examples + true_randoms_N10 + true_randoms_N20 + + def test_does_not_work_grabage(self): + for garbage in self.get_all_garbage(): + with self.subTest(garbage): + self.assertFalse(_is_spacy_model_folder(garbage)) + + +class FindSpacyFolderJustOneFolderEmptyFilesTests(unittest.TestCase): + + @classmethod + def setUpClass(cls, spacy_folder_name='en_core_web_md') -> None: + # setup temp folder + cls.temp_folder = tempfile.TemporaryDirectory() + cls.fake_modelpack_folder_name = cls.temp_folder.name + # create spacy folder + cls.spacy_folder = os.path.join(cls.fake_modelpack_folder_name, spacy_folder_name) + os.makedirs(cls.spacy_folder) + # create 2 empty files + filenames = ["file1.dat", "file2.json"] + filenames = [os.path.join(cls.fake_modelpack_folder_name, fn) for fn in filenames] + for fn in filenames: + with open(fn, 'w'): + pass # open and write empty file + + @classmethod + def tearDownClass(cls) -> None: + cls.temp_folder.cleanup() + + def test_finds(self): + found_folder_path = _find_spacy_model_folder(self.fake_modelpack_folder_name) + self.assertEqual(found_folder_path, self.spacy_folder) + + +class FindSpacyFolderMoreFoldersEmptyFilesTests(FindSpacyFolderJustOneFolderEmptyFilesTests): + + @classmethod + def setUpClass(cls, spacy_folder_name='en_core_web_md') -> None: + super().setUpClass(spacy_folder_name) + # add a few folders + folder_names = ["meta_Presence", "garbage_in_garbage_out"] + folder_names = [os.path.join(cls.fake_modelpack_folder_name, fn) for fn in folder_names] + for folder in folder_names: + os.makedirs(folder) + + +class SpacyVersionTests(unittest.TestCase): + + def test_version_received(self): + installed = get_installed_spacy_version() + import spacy + expected = spacy.__version__ + self.assertEqual(installed, expected) + + +class InstalledVersionChecker(unittest.TestCase): + + def test_existing(self, model_name: str = 'en_core_web_md'): + version = get_installed_model_version(model_name) + self.assertIsInstance(version, str) + self.assertNotEqual(version, "N/A") + + def test_non_existing(self, model_name: str = 'en_core_web_lg'): + version = get_installed_model_version(model_name) + self.assertIsInstance(version, str) + self.assertEqual(version, "N/A") + + +class GetSpacyModelInfoTests(unittest.TestCase): + expected_version = "3.1.0" + + @classmethod + def setUpClass(cls) -> None: + cls.name, cls.info = _get_name_and_meta_of_spacy_model_in_medcat_modelpack(FAKE_MODELPACK_MODEL_DIR) + + def test_reads_name(self): + self.assertEqual(self.name, FAKE_SPACY_MODEL_NAME) + + def test_reads_info(self): + self.assertIsInstance(self.info, dict) + self.assertTrue(self.info) # not empty + + +class GetSpacyModelVersionTests(GetSpacyModelInfoTests): + expected_spacy_version = ">=3.1.0,<3.2.0" + + @classmethod + def setUpClass(cls) -> None: + (cls.name, + cls.version, + cls.spacy_version) = get_name_and_version_of_spacy_model_in_medcat_modelpack(FAKE_MODELPACK_MODEL_DIR) + + def test_name_correct(self): + self.assertEqual(self.name, FAKE_SPACY_MODEL_NAME) + + def test_version_correct(self): + self.assertEqual(self.version, self.expected_version) + + def 
test_spacy_version_correct(self): + self.assertEqual(self.spacy_version, self.expected_spacy_version) + + +@contextmanager +def custom_spacy_version(mock_version: str): + """Changes the apparently installed spacy version. + """ + print(f"Mocking spacy version to: {mock_version}") + _old_method = module_under_test.get_installed_spacy_version + module_under_test.get_installed_spacy_version = lambda: mock_version + yield mock_version + print("Returning regular spacy version getter") + module_under_test.get_installed_spacy_version = _old_method + + +class VersionMockBaseTests(unittest.TestCase): + + def base_subtest_for(self, target_fun: Callable[[str], bool], + spacy_model_range: str, spacy_version: str, should_work: bool) -> None: + with self.subTest(spacy_version): + if should_work: + self.assertTrue(target_fun(spacy_model_range)) + else: + self.assertFalse(target_fun(spacy_model_range)) + + def base_check_version(self, target_fun: Callable[[str], bool], + spacy_model_range: str, spacy_version: str, should_work: bool = True) -> None: + with custom_spacy_version(spacy_version): + self.base_subtest_for(target_fun, spacy_model_range, spacy_version, should_work) + +class SpacyVersionMockBaseTests(VersionMockBaseTests): + + def _subtest_for(self, spacy_model_range: str, spacy_version: str, should_work: bool) -> None: + return self.base_subtest_for(_is_spacy_version_within_range, + spacy_model_range, spacy_version, should_work) + + def _check_version(self, spacy_model_range: str, spacy_version: str, should_work: bool = True) -> None: + return self.base_check_version(_is_spacy_version_within_range, + spacy_model_range, spacy_version, should_work) + + +class SpacyVersionInRangeOldRangeTests(SpacyVersionMockBaseTests): + """This is for versions before 1.7.0. + Those versions used to have spacy constraints of 'spacy<3.1.4,>=3.1.0' + and as such, they used v3.1.0 of en_core_web_md. + """ + spacy_model_range = ">=3.1.0,<3.2.0" # model range for en_core_web_md-3.1.0 + useful_spacy_versions = ["3.1.0", "3.1.2", "3.1.3"] + unsupported_spacy_versions = ["3.2.0", "3.5.3", "3.6.0"] + + def test_works_in_range(self): + for spacy_version in self.useful_spacy_versions: + self._check_version(self.spacy_model_range, spacy_version, should_work=True) + + def test_not_suitable_outside_range(self): + for spacy_version in self.unsupported_spacy_versions: + self._check_version(self.spacy_model_range, spacy_version, should_work=False) + + +class SpacyVersionInRangeNewRangeTests(SpacyVersionInRangeOldRangeTests): + """This is for versions AFTER (and includring) 1.7.0. + Those versions used to have spacy constraints of 'spacy>=3.1.0' + and as such, we use v3.4.0 of en_core_web_md. + + In this setup, generally (in GHA at 14.12.2023) + the spacy version for python version: + 3.8 -> spacy-3.7.2 + 3.9 -> spacy-3.7.2 + 3.10 -> spacy-3.7.2 + 3.11 -> spacy-3.7.2 + Alongside the `en_core_web_md-3.4.0` is installed. + It technically has the compatibility of >=3.4.0,<3.5.0. + But practically, I've seen no issues with spacy==3.7.2. 
+ """ + spacy_model_range = ">=3.1.0" # model range for medcat>=1.7.0 + useful_spacy_versions = ["3.1.0", "3.1.2", "3.1.3", + "3.7.2", "3.6.3"] + unsupported_spacy_versions = ["3.0.0"] + + +class ModelPackHasCompatibleSpacyRangeTests(unittest.TestCase): + test_spacy_version = "3.1.0" + + def test_is_in_range(self): + with custom_spacy_version(self.test_spacy_version): + b = medcat_model_pack_has_compatible_spacy_model(FAKE_MODELPACK_MODEL_DIR) + self.assertTrue(b) + +class ModelPackHasInCompatibleSpacyRangeTests(unittest.TestCase): + test_spacy_version = "3.2.0" + + def test_is_in_range(self): + with custom_spacy_version(self.test_spacy_version): + b = medcat_model_pack_has_compatible_spacy_model(FAKE_MODELPACK_MODEL_DIR) + self.assertFalse(b) + + +class IsOlderSpacyVersionTests(VersionMockBaseTests): + test_spacy_version = "3.4.4" + expected_older = ["3.1.0", "3.2.0", "3.3.0", "3.4.0"] + expected_newer = ["3.5.0", "3.6.0", "3.7.1"] + + def _check_version(self, model_version: str, should_work: bool = True) -> None: + self.base_check_version(is_older_spacy_version, model_version, self.test_spacy_version, should_work) + + def test_older_works(self): + for model_version in self.expected_older: + self._check_version(model_version, should_work=True) + + def test_newer_fails(self): + for model_version in self.expected_newer: + self._check_version(model_version, should_work=False) + + +class HasSemiCompatibleSpacyModelTests(unittest.TestCase): + # model version on file is 3.1.0, + # and spacy_version range >=3.1.0,<3.2.0" + good_spacy_version = "3.1.3" + semi_good_spacy_version = "3.4.4" # newer than the model + bad_spacy_version = "3.0.0" # older than the model + + def run_subtest(self, spacy_version: str, should_work: bool) -> None: + with custom_spacy_version(spacy_version): + if should_work: + self.assertTrue(medcat_model_pack_has_semi_compatible_spacy_model(FAKE_MODELPACK_MODEL_DIR)) + else: + self.assertFalse(medcat_model_pack_has_semi_compatible_spacy_model(FAKE_MODELPACK_MODEL_DIR)) + + def test_works_compatible_spacy_version(self): + self.run_subtest(self.good_spacy_version, should_work=True) + + def test_works_semi_compatible_spacy_version(self): + self.run_subtest(self.semi_good_spacy_version, should_work=True) + + def test_fails_incompatible_spacy_version(self): + self.run_subtest(self.bad_spacy_version, should_work=False) From 4de89315f28211368e89a6f2c5e0fe69433750af Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 29 Jan 2024 13:59:23 +0200 Subject: [PATCH 58/64] CU-8693kp0gw: Pin more recent versions for major dependencies; Avoid major bumps where applicable (#392) * CU-8693kp0gw: Pin more recent versions for major dependencies; Avoid major bumps where applicable * CU-8693kp0gw: Bump default spacy model version (3.4.0 -> 3.6.0) * CU-8693kp0gw: Bump fakse test-spacy model version compatibility range * CU-8693kp0gw: Fix spacy compatibility tests for new fake model * CU-8693kp0gw: Undo what I thought was a typo fix (moved back from 'movar~virus' to 'movar~viruse') --- requirements-dev.txt | 2 +- requirements.txt | 2 +- setup.py | 18 +++++++++--------- tests/resources/ff_core_fake_dr/meta.json | 2 +- tests/test_ner.py | 3 +-- tests/utils/test_spacy_compatibility.py | 6 +++--- 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5df4545d3..b4861fac3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ . 
-https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl +https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl flake8==4.0.1 mypy==1.0.0 mypy-extensions==0.4.3 diff --git a/requirements.txt b/requirements.txt index a2387e16a..45842566e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ . -https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl +https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl diff --git a/setup.py b/setup.py index 34963943a..7f018ae75 100644 --- a/setup.py +++ b/setup.py @@ -18,18 +18,18 @@ 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', 'medcat.utils.saving', 'medcat.utils.regression', 'medcat.stats'], install_requires=[ - 'numpy>=1.22.0', # first to support 3.11 + 'numpy>=1.22.0,<1.26.0', # 1.22.0 is first to support python 3.11; post 1.26.0 there's issues with scipy 'pandas>=1.4.2', # first to support 3.11 - 'gensim>=4.3.0', # first to support 3.11 - 'spacy>=3.1.0', - 'scipy~=1.9.2', # first to support 3.11 - 'transformers>=4.34.0', + 'gensim>=4.3.0,<5.0.0', # 5.3.0 is first to support 3.11; avoid major version bump + 'spacy>=3.6.0,<4.0.0', # Some later model packs (e.g HPO) are made with 3.6.0 spacy model; avoid major version bump + 'scipy~=1.9.2', # 1.9.2 is first to support 3.11 + 'transformers>=4.34.0,<5.0.0', # avoid major version bump 'accelerate>=0.23.0', # required by Trainer class in de-id - 'torch>=1.13.0', # first to support 3.11 + 'torch>=1.13.0,<3.0.0', # 1.13 is first to support 3.11; 2.1.2 has been compatible, but avoid major 3.0.0 for now 'tqdm>=4.27', - 'scikit-learn>=1.1.3', # first to supporrt 3.11 - 'dill>=0.3.4', # allow later versions with later versions of datasets (tested with 0.3.6) - 'datasets>=2.2.2', # allow later versions, tested with 2.7.1 + 'scikit-learn>=1.1.3,<2.0.0', # 1.1.3 is first to supporrt 3.11; avoid major version bump + 'dill>=0.3.6,<1.0.0', # stuff saved in 0.3.6/0.3.7 is not always compatible with 0.3.4/0.3.5; avoid major bump + 'datasets>=2.2.2,<3.0.0', # avoid major bump 'jsonpickle>=2.0.0', # allow later versions, tested with 3.0.0 'psutil>=5.8.0', # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets diff --git a/tests/resources/ff_core_fake_dr/meta.json b/tests/resources/ff_core_fake_dr/meta.json index fe9825db7..48c50ae41 100644 --- a/tests/resources/ff_core_fake_dr/meta.json +++ b/tests/resources/ff_core_fake_dr/meta.json @@ -4,5 +4,5 @@ "version":"3.1.0", "description":"This is a FAKE model", "author":"Fakio Martimus", - "spacy_version":">=3.1.0,<3.2.0" + "spacy_version":">=3.1.0,<4.0.0" } \ No newline at end of file diff --git a/tests/test_ner.py b/tests/test_ner.py index 102224552..b5b185842 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -67,8 +67,7 @@ def tearDownClass(cls) -> None: cls.pipe.destroy() def test_aa_cdb_names_output(self): - print("Fixing 'movar~viruse' -> 'movar-virus' for newere en_core_web_md") - target_result = {'S-229004': {'movar~virus', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}} + target_result = {'S-229004': {'movar~viruse', 'movar', 'movar~viruses'}, 'S-229005': {'cdb'}} self.assertEqual(self.cdb.cui2names, target_result) def test_ab_entities_length(self): diff --git a/tests/utils/test_spacy_compatibility.py 
b/tests/utils/test_spacy_compatibility.py index 5cf0dd03e..ff10571c4 100644 --- a/tests/utils/test_spacy_compatibility.py +++ b/tests/utils/test_spacy_compatibility.py @@ -149,7 +149,7 @@ def test_reads_info(self): class GetSpacyModelVersionTests(GetSpacyModelInfoTests): - expected_spacy_version = ">=3.1.0,<3.2.0" + expected_spacy_version = ">=3.1.0,<4.0.0" @classmethod def setUpClass(cls) -> None: @@ -253,9 +253,9 @@ def test_is_in_range(self): self.assertTrue(b) class ModelPackHasInCompatibleSpacyRangeTests(unittest.TestCase): - test_spacy_version = "3.2.0" + test_spacy_version = "3.0.0" - def test_is_in_range(self): + def test_is_not_in_range(self): with custom_spacy_version(self.test_spacy_version): b = medcat_model_pack_has_compatible_spacy_model(FAKE_MODELPACK_MODEL_DIR) self.assertFalse(b) From 85cbe7726e05361fbcf60208153943bf3334b0e4 Mon Sep 17 00:00:00 2001 From: Jerry Genser Date: Tue, 30 Jan 2024 04:00:23 -0500 Subject: [PATCH 59/64] add: metacat can predict on spans in arbitrary spangroups (#391) * add: ability to predict on other spangroups * add: pr comments and better error * fix: typo * fix: linting --- medcat/config_meta_cat.py | 3 +++ medcat/meta_cat.py | 27 +++++++++++++-------- tests/test_meta_cat.py | 49 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 67 insertions(+), 12 deletions(-) diff --git a/medcat/config_meta_cat.py b/medcat/config_meta_cat.py index ae3e82ef8..47f42dc28 100644 --- a/medcat/config_meta_cat.py +++ b/medcat/config_meta_cat.py @@ -37,6 +37,9 @@ class General(MixingConfig, BaseModel): a deployment.""" pipe_batch_size_in_chars: int = 20000000 """How many characters are piped at once into the meta_cat class""" + span_group: Optional[str] = None + """If set, the spacy span group that the metacat model will assign annotations. + Otherwise defaults to doc._.ents or doc.ents per the annotate_overlapping settings""" class Config: extra = Extra.allow diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index d92e6ea61..bf7f09709 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -5,7 +5,7 @@ import numpy from multiprocessing import Lock from torch import nn, Tensor -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from datetime import datetime from typing import Iterable, Iterator, Optional, Dict, List, Tuple, cast, Union from medcat.utils.hasher import Hasher @@ -357,6 +357,20 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "MetaCA return meta_cat + def get_ents(self, doc: Doc) -> Iterable[Span]: + spangroup_name = self.config.general.span_group + if spangroup_name: + try: + return doc.spans[spangroup_name] + except KeyError: + raise Exception(f"Configuration error MetaCAT was configured to set meta_anns on {spangroup_name} but this spangroup was not set on the doc.") + + # Should we annotate overlapping entities + if self.config.general['annotate_overlapping']: + return doc._.ents + + return doc.ents + def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowercase: bool) -> Tuple: """Prepares document. 
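
To show how the new `span_group` option is meant to be wired up (this mirrors the test added further down in this patch; the group name is arbitrary and `meta_cat` stands in for a trained MetaCAT instance):

    import spacy
    from spacy.tokens import Span

    # span extensions that the MetaCAT pipe reads/writes
    Span.set_extension('id', default=0, force=True)
    Span.set_extension('meta_anns', default=None, force=True)

    nlp = spacy.blank("en")
    doc = nlp("Pt has diabetes and copd.")
    doc.spans["my_span_group"] = [doc.char_span(7, 15, label="diabetes"),
                                  doc.char_span(20, 24, label="copd")]

    meta_cat.config.general.span_group = "my_span_group"  # `meta_cat`: trained MetaCAT (placeholder)
    doc = meta_cat(doc)  # meta_anns are now set on the spans in doc.spans["my_span_group"]
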
@@ -381,11 +395,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe cntx_right = config.general['cntx_right'] replace_center = config.general['replace_center'] - # Should we annotate overlapping entities - if config.general['annotate_overlapping']: - ents = doc._.ents - else: - ents = doc.ents + ents = self.get_ents(doc) samples = [] last_ind = 0 @@ -522,10 +532,7 @@ def _set_meta_anns(self, predictions = all_predictions[start_ind:end_ind] confidences = all_confidences[start_ind:end_ind] - if config.general['annotate_overlapping']: - ents = doc._.ents - else: - ents = doc.ents + ents = self.get_ents(doc) for ent in ents: ent_ind = ent_id2ind[ent._.id] diff --git a/tests/test_meta_cat.py b/tests/test_meta_cat.py index df5be9f77..8cd444668 100644 --- a/tests/test_meta_cat.py +++ b/tests/test_meta_cat.py @@ -7,7 +7,8 @@ from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT - +import spacy +from spacy.tokens import Span class MetaCATTests(unittest.TestCase): @@ -19,7 +20,7 @@ def setUpClass(cls) -> None: config.train['nepochs'] = 1 config.model['input_size'] = 100 - cls.meta_cat = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config) + cls.meta_cat: MetaCAT = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config) cls.tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp") os.makedirs(cls.tmp_dir, exist_ok=True) @@ -44,6 +45,50 @@ def test_save_load(self): self.assertEqual(f1, n_f1) + def _prepare_doc_w_spangroup(self, spangroup_name: str): + """ + Create spans under an arbitrary spangroup key + """ + Span.set_extension('id', default=0, force=True) + Span.set_extension('meta_anns', default=None, force=True) + nlp = spacy.blank("en") + doc = nlp("Pt has diabetes and copd.") + span_0 = doc.char_span(7,15, label="diabetes") + assert span_0.text == 'diabetes' + + span_1 = doc.char_span(20,24, label="copd") + assert span_1.text == 'copd' + doc.spans[spangroup_name] = [span_0, span_1] + return doc + + def test_predict_spangroup(self): + json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'resources', 'mct_export_for_meta_cat_test.json') + self.meta_cat.train(json_path, save_dir_path=self.tmp_dir) + self.meta_cat.save(self.tmp_dir) + n_meta_cat = MetaCAT.load(self.tmp_dir) + + spangroup_name = "mock_span_group" + n_meta_cat.config.general.span_group = spangroup_name + + doc = self._prepare_doc_w_spangroup(spangroup_name) + doc = n_meta_cat(doc) + spans = doc.spans[spangroup_name] + self.assertEqual(len(spans), 2) + + # All spans are annotate + for span in spans: + self.assertEqual(span._.meta_anns['Status']['value'], "Affirmed") + + # Informative error if spangroup is not set + doc = self._prepare_doc_w_spangroup("foo") + n_meta_cat.config.general.span_group = "bar" + try: + doc = n_meta_cat(doc) + except Exception as error: + self.assertIn("Configuration error", str(error)) + + n_meta_cat.config.general.span_group = None + if __name__ == '__main__': unittest.main() From 0a9a615d9700feead5f1e95f7f6358803680d05d Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 8 Feb 2024 18:16:09 +0000 Subject: [PATCH 60/64] CU-8693ruk7p: Bump mypy version in dev-requirements (#396) * CU-8693ruk7p: Bump mypy version (along with extensions) in dev-requirements * CU-8693ruk7p: Pin max major version for mypy --- requirements-dev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt 
b/requirements-dev.txt index b4861fac3..2c9528cde 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,8 @@ . https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl flake8==4.0.1 -mypy==1.0.0 -mypy-extensions==0.4.3 +mypy>=1.7.0,<2.0.0 +mypy-extensions>=1.0.0 types-aiofiles==0.8.3 types-PyYAML==6.0.3 types-setuptools==57.4.10 From df74f3214e4825f51ded1aa09fb98ab805203fa0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 10:23:10 +0000 Subject: [PATCH 61/64] Bump django from 3.2.23 to 3.2.24 in /webapp/webapp (#395) * Bump django from 3.2.23 to 3.2.24 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.23 to 3.2.24. - [Commits](https://github.com/django/django/compare/3.2.23...3.2.24) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] * CU-8693ruk7p: Bump mypy version in dev-requirements (#396) (#398) * CU-8693ruk7p: Bump mypy version (along with extensions) in dev-requirements * CU-8693ruk7p: Pin max major version for mypy --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mart Ratas --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index ce68f853d..4de8f07cc 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.23 +Django==3.2.24 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From e8658c487a07bf63547bf6f328c10200af886f1e Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 12 Feb 2024 11:03:50 +0000 Subject: [PATCH 62/64] CU-8693t24ed: Add workaround for older DeID models in newer MedCAT (#397) * CU-8693t24ed: Add workaround for older DeID models in newer MedCAT * CU-8693t24ed: Add a check for existing attribute before changing it --- medcat/ner/transformers_ner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py index 227ccc083..729be4625 100644 --- a/medcat/ner/transformers_ner.py +++ b/medcat/ner/transformers_ner.py @@ -79,6 +79,10 @@ def __init__(self, cdb, config: Optional[ConfigTransformersNER] = None, def create_eval_pipeline(self): self.ner_pipe = pipeline(model=self.model, task="ner", tokenizer=self.tokenizer.hf_tokenizer) + if not hasattr(self.ner_pipe.tokenizer, '_in_target_context_manager'): + # NOTE: this will fix the DeID model(s) created before medcat 1.9.3 + # though this fix may very well be unstable + self.ner_pipe.tokenizer._in_target_context_manager = False self.ner_pipe.device = self.model.device def get_hash(self): From 08570eb1789cfebb3eff4c2337606ea7f57ddeac Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 12 Feb 2024 16:43:25 +0000 Subject: [PATCH 63/64] CU-2hz5ump deid mulitprocessing (#393) * CU-2hz5ump: Separate the text replacement in deid * CU-2hz5ump: Fix some indentation on multiprocessing methods in CAT * CU-2hz5ump: Add method to deid multithreaded * CU-2hz5ump: Add tests for deid multiprocessing * CU-2hz5ump: Fix return type for multiprocessing deid method * CU-2hz5ump: Remove unused import * CU-2hz5ump: Fix typing issue within deid multi texts method * CU-2hz5ump: Add removal parts to deid tests * CU-2hz5ump: Add error handling with message to deid 
multiprocessing issues * CU-2hz5ump: Unpin mypy for dev requirements * CU-2hz5ump: Fix mypy unpin typo * CU-2hz5ump: Force later version of mypy * CU-2hz5ump: Force mypy extensions to newer version * CU-2hz5ump: Add 20 minute timeout to main workflow * CU-2hz5ump: Add 20 minute timeout to main workflow (build) * CU-2hz5ump: Add 19 minute timeout to tests step of main workflow * CU-2hz5ump: Move to a 17 minute timeout to tests step of main workflow * CU-2hz5ump: Add a 10 minute timeout for multiprocessing DeID tests * Revert "CU-2hz5ump: Add a 10 minute timeout for multiprocessing DeID tests" This reverts commit 5e223346cd8897620befaf219413d7a414d63539. * CU-2hz5ump: Add a 3 minute timeout (through a decorator) to multiprocessing DeID tests * CU-2hz5ump: Remove overly strict DeID test * CU-2hz5ump: Add condition for number of results for multiprocessing DeID test --- .github/workflows/main.yml | 4 +- medcat/cat.py | 43 +++++++++++------- medcat/utils/ner/deid.py | 39 +++++++++++++++- medcat/utils/ner/helpers.py | 13 +++++- requirements-dev.txt | 1 + tests/resources/deid_test_data.json | 1 + tests/utils/ner/test_deid.py | 70 ++++++++++++++++++++++++++++- 7 files changed, 148 insertions(+), 23 deletions(-) create mode 100644 tests/resources/deid_test_data.json diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a5468fb9b..687160ed9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,7 +33,8 @@ jobs: flake8 medcat - name: Test run: | - python -m unittest discover + timeout 17m python -m unittest discover + continue-on-error: true publish-to-test-pypi: @@ -43,6 +44,7 @@ jobs: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') != true runs-on: ubuntu-20.04 + timeout-minutes: 20 concurrency: publish-to-test-pypi needs: [build] diff --git a/medcat/cat.py b/medcat/cat.py index d3003b24b..9159eddd8 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1005,11 +1005,11 @@ def get_entities(self, return out def get_entities_multi_texts(self, - texts: Union[Iterable[str], Iterable[Tuple]], - only_cui: bool = False, - addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], - n_process: Optional[int] = None, - batch_size: Optional[int] = None) -> List[Dict]: + texts: Union[Iterable[str], Iterable[Tuple]], + only_cui: bool = False, + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], + n_process: Optional[int] = None, + batch_size: Optional[int] = None) -> List[Dict]: """Get entities Args: @@ -1053,6 +1053,15 @@ def get_entities_multi_texts(self, for o in out: if o is not None: o.pop('text', None) + except RuntimeError as e: + if e.args == ('_share_filename_: only available on CPU',): + raise ValueError("Issue while performing multiprocessing. " + "This is mostly likely to happen when " + "using NER models (i.e DeId). 
If that is " + "the case you could either a) save the " + "model on disk and then load it back up; " + "or b) install cpu-only toch.") from e + raise e finally: self.pipe.reset_error_handler() @@ -1375,20 +1384,20 @@ def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc, - batch_size=batch_size, - only_cui=only_cui, - addl_info=addl_info, - return_dict=return_dict, - batch_factor=batch_factor) + batch_size=batch_size, + only_cui=only_cui, + addl_info=addl_info, + return_dict=return_dict, + batch_factor=batch_factor) def multiprocessing_batch_docs_size(self, - in_data: Union[List[Tuple], Iterable[Tuple]], - nproc: Optional[int] = None, - batch_size: Optional[int] = None, - only_cui: bool = False, - addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], - return_dict: bool = True, - batch_factor: int = 2) -> Union[List[Tuple], Dict]: + in_data: Union[List[Tuple], Iterable[Tuple]], + nproc: Optional[int] = None, + batch_size: Optional[int] = None, + only_cui: bool = False, + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], + return_dict: bool = True, + batch_factor: int = 2) -> Union[List[Tuple], Dict]: """Run multiprocessing NOT FOR TRAINING. This method batches the data based on the number of documents as specified by the user. diff --git a/medcat/utils/ner/deid.py b/medcat/utils/ner/deid.py index 7c5d0231c..13ee5e04c 100644 --- a/medcat/utils/ner/deid.py +++ b/medcat/utils/ner/deid.py @@ -34,12 +34,12 @@ - config - cdb """ -from typing import Union, Tuple, Any +from typing import Union, Tuple, Any, List, Iterable, Optional from medcat.cat import CAT from medcat.utils.ner.model import NerModel -from medcat.utils.ner.helpers import _deid_text as deid_text +from medcat.utils.ner.helpers import _deid_text as deid_text, replace_entities_in_text class DeIdModel(NerModel): @@ -72,8 +72,43 @@ def deid_text(self, text: str, redact: bool = False) -> str: Returns: str: The deidentified text. """ + self.cat.get_entities return deid_text(self.cat, text, redact=redact) + def deid_multi_texts(self, + texts: Union[Iterable[str], Iterable[Tuple]], + redact: bool = False, + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], + n_process: Optional[int] = None, + batch_size: Optional[int] = None) -> List[str]: + """Deidentify text on multiple branches + + Args: + texts (Union[Iterable[str], Iterable[Tuple]]): Text to be annotated + redact (bool): Whether to redact the information. + addl_info (List[str], optional): Additional info. Defaults to ['cui2icd10', 'cui2ontologies', 'cui2snomed']. + n_process (Optional[int], optional): Number of processes. Defaults to None. + batch_size (Optional[int], optional): The size of a batch. Defaults to None. + + Returns: + List[str]: List of deidentified documents. 
+ """ + entities = self.cat.get_entities_multi_texts(texts, addl_info=addl_info, + n_process=n_process, batch_size=batch_size) + out = [] + for raw_text, _ents in zip(texts, entities): + ents = _ents['entities'] + text: str + if isinstance(raw_text, tuple): + text = raw_text[1] + elif isinstance(raw_text, str): + text = raw_text + else: + raise ValueError(f"Unknown raw text: {type(raw_text)}: {raw_text}") + new_text = replace_entities_in_text(text, ents, get_cui_name=self.cat.cdb.get_name, redact=redact) + out.append(new_text) + return out + @classmethod def load_model_pack(cls, model_pack_path: str) -> 'DeIdModel': """Load DeId model from model pack. diff --git a/medcat/utils/ner/helpers.py b/medcat/utils/ner/helpers.py index 7dcada3dd..b22809696 100644 --- a/medcat/utils/ner/helpers.py +++ b/medcat/utils/ner/helpers.py @@ -1,3 +1,5 @@ +from typing import Callable, Dict + from medcat.utils.data_utils import count_annotations from medcat.cdb import CDB @@ -27,11 +29,18 @@ def _deid_text(cat, text: str, redact: bool = False) -> str: Returns: str: The de-identified document. """ - new_text = str(text) entities = cat.get_entities(text)['entities'] + return replace_entities_in_text(text, entities, cat.cdb.get_name, redact=redact) + + +def replace_entities_in_text(text: str, + entities: Dict, + get_cui_name: Callable[[str], str], + redact: bool = False) -> str: + new_text = str(text) for ent in sorted(entities.values(), key=lambda ent: ent['start'], reverse=True): r = "*"*(ent['end']-ent['start'] - ) if redact else cat.cdb.get_name(ent['cui']) + ) if redact else get_cui_name(ent['cui']) new_text = new_text[:ent['start']] + f'[{r}]' + new_text[ent['end']:] return new_text diff --git a/requirements-dev.txt b/requirements-dev.txt index 2c9528cde..1d210bc4c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,3 +6,4 @@ mypy-extensions>=1.0.0 types-aiofiles==0.8.3 types-PyYAML==6.0.3 types-setuptools==57.4.10 +timeout-decorator==0.5.0 diff --git a/tests/resources/deid_test_data.json b/tests/resources/deid_test_data.json new file mode 100644 index 000000000..1ed443a69 --- /dev/null +++ b/tests/resources/deid_test_data.json @@ -0,0 +1 @@ +{"projects": [{"name": "/Users/martratas/Documents/CogStack/.MedCAT.nosync/MedCAT/temp/deid/testing-PHI-Gold-fixed.tar.gz", "documents": [{"text": "\n\n\nRecord date: 2090-07-16\n\n\n\n\nNAME: Curtis, Om \nMRN: 7682941\n \nHe is feeling great. He is all done with his radiation to the left axilla for metastatic\nsquamous cell cancer. He is following closely with the radiation oncologist and the\nmedical oncologist. He is seeing them both later this month. He has had no\nproblems with chest pains or shortness of breath. All in all, things are going well.\n\nPHYSICAL EXAM: On exam, no acute distress. Lungs are clear. Heart is regular\nrate and rhythm. No murmurs, gallops or rubs. He does have some skin\ndiscoloration around the left axilla but I feel no mass. He has a well-healed incision. \nThere is no hair noted in or around the axilla. Extremities with no edema.\n\nASSESSMENT AND PLAN: \n\n(1) CAD/hypertension/diabetes mellitus. This is stable. Check glycosylated\n hemoglobin. \n\n(2) Metastatic squamous cell cancer. He is being followed closely by Oncology for\n this. Follow-up with me in the spring.\n\nWilliam V. 
Geiger, M.D.\n\nWVG/xin/quilici\n\n\n\n\n", "name": "119-03.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2090-07-16"}, {"start": "40", "end": "50", "cui": "PATIENT", "value": "Curtis, Om"}, {"start": "61", "end": "68", "cui": "MEDICALRECORD", "value": "7682941"}, {"start": "972", "end": "978", "cui": "DATE", "value": "spring"}, {"start": "981", "end": "998", "cui": "DOCTOR", "value": "William V. Geiger"}, {"start": "1006", "end": "1009", "cui": "DOCTOR", "value": "WVG"}, {"start": "1010", "end": "1013", "cui": "DOCTOR", "value": "xin"}, {"start": "1014", "end": "1021", "cui": "DOCTOR", "value": "quilici"}]}, {"text": "\n\n\nRecord date: 2078-03-17\n\n\n\nPatient Name: JORGENSON,VIVIANLEE [ 47190847(JMH) ] Date of Visit: 03/17/2078\n\n\nCC: Syncope, Afib\n\n\nHPI: \n71 year old lady with a history of A fib was seen in clinic today for complaints of DOE and increased wt. She was found to be in mild to moderate CHF and an increase in her torsemide dose was recommended. On her way to a blood draw, while in the elevator, she had syncope and hit her head. The fall was witnessed by her daughter. There was no prodrome, no nausea, no incontinence with the fall. The pt does not have any seizure or hypoglycemia history. She notes an 8 lb wt gain over 2-3 weeks and denies dietary indiscretions. She takes her medications faithfully which include an escalating dose of torsemide in over the last several months. She's been hospitalized twice for CHF - once in 2075 and again in 11/77.\n\n\nThe pt was brought to the ED. In the ED the patient had a head CT was done which did not reveal any acute pathology.\n\n\nPMH:Cardiomyopathy : Nonischemic, 6/19/74 cath no significant coronary disease, 11/26/77 EF 20% with and global hypokinesis \nCoronary artery disease : 6/19/74 cath: RCA 30%\nThyroid cancer : Papillary nodule Ca, 2071, a/p thyroidectomy\nDiabetes: Had been on glyburide, currently diet controlled, 11/25 A1c 6.3\nHypothyroidism: 11/25: 1.4 normal\nElevated cholesterol: 11/26: LDL of 36 and an HDL of 26\nHypertension \nAtrial fibrillation on Coumadin\nNon sustained ventricular tachycardia \n\n\nMedications\nAmbien (ZOLPIDEM TARTRATE) 5 MG (5MG TABLET take 1) PO QHS PRN \nAsa (ACETYLSALICYLIC ACID) 81 MG (81MG TABLET take 1) PO QD \nCelexa (CITALOPRAM) 20MG TABLET take 1 Tablet(s) PO QD \nCoreg (CARVEDILOL) 12.5 MG (12.5MG TABLET take 1) PO BID \nCoumadin (WARFARIN SODIUM) 1 MG (2.5MG TABLET take 1) PO QPM \nDigoxin 0.0612 MG (125MCG TABLET take 1) PO QD \nLipitor (ATORVASTATIN) 20 MG (20MG TABLET take 1) PO QD \nLisinopril 40 MG PO QD \nPotassium CHLORIDE SLOW REL. (KCL SLOW RELEASE) 20 MEQ (20MEQ TAB PRT SR take 1) PO BID \nSynthroid (LEVOTHYROXINE SODIUM) 150MCG TABLET PO variable \nTorsemide 200 MG (20MG TABLET take 1) PO QD \n\n\n\n\n\nAllergies\nNKA \n \nFH: Mom had MI, age 50\n\n\nSH: \n Lives with husband (Blacksmith, recently had CABG) and has 4 grown children.\n\nHealth-Related Behaviors\nAlcohol-social only\nTobacco-prior use, 60 ppy, quit 2067Drug use-no illicit drugs \n\n\nPE: \nT 98, BP 100/65, HR 131, RR 18, 96%RA\nGEN: NAD\nHEENT: PERRL, EOMI, mm moist\nCV: irreg irreg, no murmurs appreciated. JVP 15 cm\nLUNG: Rales in bases bilaterally\nABD: soft, non-tender, non-distended\nEXT: no c/c/e\nNeuro: A&Ox3, moves all extremiteis.\n\n\nCXR: enlarged heart. Minimal pulmonary infiltrate\nEKG: A fib, no ischemic changes, poor RWP. 
Unchanged from previous.\n\n\nLabs\nResultsDate/Time NA K CL CO2 03/17/2078 [1] 139 4.6 103 28 03/17/2078 139 4.2 102 26 Date/Time BUN CRE EGFR GLU 03/17/2078 [2] 50 (*) 1.73 (*) 29 [3] 110 03/17/2078 50 (*) 1.66 (*#) 30 [4] 106 Date/Time ANION 03/17/2078 [5] 8 03/17/2078 11 Date/Time CA MG TBILI TP 03/17/2078 [6] 9.7 3.0 (*) 6.9 03/17/2078 9.7 2.2 2.9 (*) 7.0 Date/Time ALB GLOB LIPS 03/17/2078 [7] 4.0 2.9 62 (*)[8] 03/17/2078 4.0 3.0 Date/Time ALT/SGPT AST/SGOT ALKP TBILI 03/17/2078 [9] 8 24 106 3.0 (*) 03/17/2078 8 (#) 22 106 2.9 (*) Date/Time CK CK-MB TROP-I 03/17/2078 [10] 56 2.1 SEE DETAIL[11] Date/Time TSH 03/17/2078 4.091 Date/Time WBC RBC HGB HCT 03/17/2078 [12] 5.68 (#) 4.67 15.2 45.8 Date/Time MCV MCH MCHC PLT 03/17/2078 [13] 98.2 (*#) 32.6 (*#) 33.1 136 (*) Date/Time RDW 03/17/2078 [14] 14.6 (*) Date/Time %POLY-A %LYMPH-A %MONO-A %EOS-A 03/17/2078 [15] 76.4 (*) 15.4 (*) 6.6 1.3 Date/Time %BASO-A 03/17/2078 [16] 0.3 Date/Time ANEUT-A ALYMP-A AMONO-A AEOS-A 03/17/2078 [17] 4.34 .88 0.37 0.07 Date/Time ABASO-A 03/17/2078 [18] 0.02 Date/Time HYPO MACRO 03/17/2078 [19] + + Date/Time PT PT-INR PTT 03/17/2078 [20] 24.1 (*) 2.1 (*) 39.2 (*) A/P: 71 year old lady with syncope of unclear eitiology with CHF/Afib. Ddx includes cardiac arrhythmia, hypovolemia, vaso-vagal. Neurologic causes less likely from history.\n\n#CV - i. Low suspicion ACS- Rule out with serial CE\n- Continue asa, statin#CV - p. Decompensated CHF- Diuresis with IV lasix. Consider lasix drip if inadequate response.\n- Daily wt. Strict I/O.- Cont. digoxin- Will start amiodarone 400 bid- Discuss possible role of AICD with patient and family.#CV - r. - Telemetry#AFR - acute on chronic - likely from poor forward flow- reduce lisinopril- urine electrolytes, eos.\n#Psych\n- continue celexa\n#Endocrine\n- continue home thyroid regimen\n#FEN\n- low sodium, 2L fluid restricted diet#CODE: Full\n\n\n\n\n\n\n\n\n____________________________________\nXavier B. 
Nix, M.D., Ph.D.\n\n\n\n", "name": "132-04.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2078-03-17"}, {"start": "44", "end": "63", "cui": "PATIENT", "value": "JORGENSON,VIVIANLEE"}, {"start": "66", "end": "74", "cui": "MEDICALRECORD", "value": "47190847"}, {"start": "75", "end": "78", "cui": "HOSPITAL", "value": "JMH"}, {"start": "97", "end": "107", "cui": "DATE", "value": "03/17/2078"}, {"start": "136", "end": "138", "cui": "AGE", "value": "71"}, {"start": "828", "end": "832", "cui": "DATE", "value": "2075"}, {"start": "846", "end": "851", "cui": "DATE", "value": "11/77"}, {"start": "1010", "end": "1017", "cui": "DATE", "value": "6/19/74"}, {"start": "1056", "end": "1064", "cui": "DATE", "value": "11/26/77"}, {"start": "1127", "end": "1134", "cui": "DATE", "value": "6/19/74"}, {"start": "1187", "end": "1191", "cui": "DATE", "value": "2071"}, {"start": "1272", "end": "1277", "cui": "DATE", "value": "11/25"}, {"start": "1302", "end": "1307", "cui": "DATE", "value": "11/25"}, {"start": "1344", "end": "1349", "cui": "DATE", "value": "11/26"}, {"start": "2145", "end": "2147", "cui": "AGE", "value": "50"}, {"start": "2177", "end": "2187", "cui": "PROFESSION", "value": "Blacksmith"}, {"start": "2312", "end": "2316", "cui": "DATE", "value": "2067"}, {"start": "2814", "end": "2824", "cui": "DATE", "value": "03/17/2078"}, {"start": "2892", "end": "2902", "cui": "DATE", "value": "03/17/2078"}, {"start": "3048", "end": "3058", "cui": "DATE", "value": "03/17/2078"}, {"start": "3126", "end": "3136", "cui": "DATE", "value": "03/17/2078"}, {"start": "3237", "end": "3247", "cui": "DATE", "value": "03/17/2078"}, {"start": "3270", "end": "3280", "cui": "DATE", "value": "03/17/2078"}, {"start": "3381", "end": "3391", "cui": "DATE", "value": "03/17/2078"}, {"start": "3459", "end": "3469", "cui": "DATE", "value": "03/17/2078"}, {"start": "3600", "end": "3610", "cui": "DATE", "value": "03/17/2078"}, {"start": "3663", "end": "3673", "cui": "DATE", "value": "03/17/2078"}, {"start": "3804", "end": "3814", "cui": "DATE", "value": "03/17/2078"}, {"start": "3882", "end": "3892", "cui": "DATE", "value": "03/17/2078"}, {"start": "4023", "end": "4033", "cui": "DATE", "value": "03/17/2078"}, {"start": "4119", "end": "4129", "cui": "DATE", "value": "03/17/2078"}, {"start": "4230", "end": "4240", "cui": "DATE", "value": "03/17/2078"}, {"start": "4386", "end": "4396", "cui": "DATE", "value": "03/17/2078"}, {"start": "4497", "end": "4507", "cui": "DATE", "value": "03/17/2078"}, {"start": "4608", "end": "4618", "cui": "DATE", "value": "03/17/2078"}, {"start": "4719", "end": "4729", "cui": "DATE", "value": "03/17/2078"}, {"start": "4830", "end": "4840", "cui": "DATE", "value": "03/17/2078"}, {"start": "4941", "end": "4951", "cui": "DATE", "value": "03/17/2078"}, {"start": "5022", "end": "5032", "cui": "DATE", "value": "03/17/2078"}, {"start": "5133", "end": "5143", "cui": "DATE", "value": "03/17/2078"}, {"start": "5223", "end": "5225", "cui": "AGE", "value": "71"}, {"start": "5976", "end": "5989", "cui": "DOCTOR", "value": "Xavier B. Nix"}]}, {"text": "\n\n\nRecord date: 2092-10-29\n\n \n\nTeam 3 Intern Admit Note\n\nName: Walton, Levi\n\nMR#: 2554172\n\nDate: 10/29/92\nPCP: Paul Eggleston\n\nCardiology: Youmans\n\n\n\nCC: emesis, light-headedness \n\n\n\nHPI: 85 y/o with CAD and active ischemia on 6/87 stress test (medically managed) who was feeling well until last night. During the day yesterday, he went on his daily walk with his wife and felt fine. 
In the evening around 8:30 pm, he experienced dull, 8/10 pain under both arm pits which resolved within 20 minutes after taking 3 SL TNG 5 mins apart. He denies SOB, N/V, radiation or diaphoresis associated with this arm pain. He states that he gets similar bilateral underarm pain about twice per week which is usually relieved by a single SL TNG within 10-15 mins. The arm pain is not exertional.\n\n\tAfter the arm pain relieved last night, the pt took 3 Aleve tabs which his daughter bought for him for arthritis pain. A couple of hours later, he became nauseated and had a single episode of dark brown emesis. After vomiting, he felt light-headed while walking around the house. He had a bowel movement this AM which he states was darker than usual, almost black looking. He denies BRBPR, abdominal pain, CP, SOB, palpitations, headache, fever or chills.\n\n\tIn the setting of significant fatigue and light-headedness, the pt presented to the ED this morning. Admission vitals: 97.2, 120/78, HR 80's, RR12, sats 97% RA. He was noted to be in A fib. Hct was 22 (down from 38 last week). NG lavage was not performed in the ED. However, he did have another episode of bilateral arm pain which was relieved by SL TNG x 3. HR at the time of arm pain was in the 80's and ECG showed A fib with possible TWI in I,L and V6 but relatively unchanged from admission ECG without pain. He subsequently received lopressor 2.5 IV/12.5 PO, isordil 10 PO and zantac 50 IV. \n\n\t \n\nPMHx:\n\n1.\tCAD\n\na.\tS/p anterior MI 2082 - pt does not recall having chest pain associated with MI\n\nb.\tCath 2082 - 3 vessel CAD, PTCA to mid-LAD lesion, 90% RCA, occluded OM2\n\nc.\tETT-mibi 6/87 - 75% MPHR, 8 METS. 0.5-1 mm ST depressions in inferior and percordial (V4, V5) leads; imaging showed inferior and posterior ischemia\n\nd.\tECHO 3/87 - EF 67%, LA 37mm, no WMA \n\n2.\tBPH\n\n3.\tOA/DJD bilateral shoulders\n\n4.\tcolonoscopy 2087 - hemorrhoids, diverticulosis, single tubular adenoma excised; no endoscopy since then \n\n5.\tDM II - diet controlled; last A1c 6.6 (10/92)\n\n6.\thyperlipidemia - chol 111, LDL 55 HDL 33, tri 115 (10/92 on treatment)\n\n\n\nMeds:\n\n1.\tatenolol 100 QD\n\n2.\tisordil 30 QID\n\n3.\tASA 81 QD\n\n4.\tlisinopril 20 QD\n\n5.\tSL TNG prn\n\n6.\tMVI\n\n7.\tlipitor 10 QD\n\n\n\nAllergies: NKDA\n\n\n\nSHx: lives with wife; walks around the mall daily\n\nTobacco - quit >35 years ago\n\nEtOH - 2 mixed drinks (vodka) per night; denies anything more\n\nDrugs - none\n\n\n\nPE: vitals 95.0 115/58 87 18 98% RA\n\nGeneral: no acute distress\n\nHEENT: sclera anicteric; EOMI, PERRLA, OP without masses or infiltrate\n\nNECK: JVP flat; carotid pulses brisk and symmetric; no carotid bruits; no thyromegaly; no cervical or supraclav. LAD\n\nLUNGS: CTA B\n\nCARDIAC: irreg irreg, nl s1s2, no M/R/G\n\nABD: soft, NT/ND, pos. 
BS, no HSM\n\nRECTAL: guaiac positive, normal tone (per ED resident)\n\nEXT: no axillary or inguinal LAD; no c/c/e\n\nMSK: significantly limited and painful ROM at both shoulders\n\nNEURO: A&O x 3; CN II-XII intact; otherwise non-focal\n\n\n\nLabs:\n\nSodium (Stat Lab) 134 L (135-145) mmol/L\n\nPotassium (Stat Lab) 3.9 (3.4-4.8) mmol/L\n\nChloride (Stat Lab) 102 (100-108) mmol/L\n\nCO2 (Stat Lab) 22.5 L (23.0-31.9) mmol/L\n\nBUN (Stat Lab) 67 H (8-25) mg/dl\n\nCreatinine (Stat Lab) 1.4 (0.6-1.5) mg/dl\n\nGlucose (Stat Lab) 291 H (70-110) mg/dl\n\n\n\nCalcium 9.2 (8.5-10.5) mg/dl\n\nPhosphorus 3.9 (2.6-4.5) mg/dl\n\nMagnesium 1.6 (1.4-2.0) meq/L\n\nTotal Protein 6.5 (6.0-8.3) g/dl\n\nAlbumin 3.5 (3.3-5.0) g/dl\n\nGlobulin 3.0 (2.6-4.1) g/dl\n\nDirect Bilirubin 0.1 (0-0.4) mg/dl\n\nTotal Bilirubin 0.3 (0-1.0) mg/dl\n\nAlkaline Phosphatase 67 (45-115) U/L\n\nTransaminase-SGPT 16 (10-55) U/L\n\nAmylase 16 (3-100) units/L\n\nLipase 2.5 (1.3-6.0) U/dl\n\nTransaminase-SGOT 28 (10-40) U/L\n\n\n\nCreatine Kinase Isoenz BORDERLINE (NEG)\n\nTroponin-I NEGATIVE (NEG)\n\n\n\nCreatine Kinase Isoenz 13.2 H (0.0-6.9) ng/ml\n\nCPK Isoenzymes Index 12.8 H (0.0-3.5) %\n\nTroponin-T 0.06 (0.00-0.09) ng/ml\n\nCreatine Kinase 103 (60-400) U/L\n\n\n\nAdmission:\n\nWBC 18.4 H (4.5-11.0) th/cmm\n\nHCT 22.2 L (41.0-53.0) %\n\nHGB 7.4 L (13.5-17.5) gm/dl\n\nRBC 2.25 L (4.50-5.90) mil/cmm\n\nPLT 243 (150-350) th/cumm\n\nMCV 99 (80-100) fl\n\nMCH 33.0 (26.0-34.0) pg/rbc\n\nMCHC 33.4 (31.0-37.0) g/dl\n\nRDW 15.6 H (11.5-14.5) %\n\nSuperstat PT 14.0 H (11.1-13.1) sec\n\nSuperstat APTT 25.3 (22.1-35.1) sec\n\n\n\n\n\nUA-Specific Gravity <1.005 (1.001-1.035)\n\nUA-pH 5.0 (5.0-9.0)\n\nUA-WBC Screen NEGATIVE (NEG)\n\nUA-Nitrite NEGATIVE (NEG)\n\nUA-Albumin NEGATIVE (NEG)\n\nUA-Glucose Trace (NEG)\n\nUA-Ketones NEGATIVE (NEG)\n\nUA-Occult Blood NEGATIVE (NEG)\n\n\n\nWBC 17.2 H (4.5-11.0) th/cmm\n\nHCT 24.7 L (41.0-53.0) %\n\nHGB 8.6 L (13.5-17.5) gm/dl\n\nRBC 2.65 L (4.50-5.90) mil/cmm\n\nPLT 210 (150-350) th/cumm\n\nMCV 93 (80-100) fl\n\nMCH 32.5 (26.0-34.0) pg/rbc\n\nMCHC 35.0 (31.0-37.0) g/dl\n\nRDW 16.8 H (11.5-14.5) %\n\n\n\nStudies:\n\nEKG AF (92) LAD TWI I,L,V6\n\nCXR IMPRESSION:\n\n\tThere are linear opacities at both bases consistent with\n\n\tsubsegmental atelectasis. Bilateral calcified pleural plaques are\n\n\tconsistent with asbestos exposure. There is no pneumothorax. No\n\npleural effusions are visualized. There is extensive change including both shoulders.\n\n\n\nAssessment: 85 y/o with medically management multivessel CAD who presents with light-headedness, fatigue and anemia. Likely coffee gound emesis, melana and increased BUN concerning for UGIB. Given history of alcohol consumption (suspect that consumption may be slightly more than given by history) and NSAID use, gastritis is a possibility. No clear reason for other sources of UGIB. Pt also found to be in Afib of unknown duration. Unclear if Afib was prompted by anemia due to UGIB.\n\n\n\nPlan:\n\n\n\n1.\tUGIB\n\na.\tActive blood back sample, IV access\n\nb.\tHct increased from 22.2 to 24.7 after 2u pRBCs. Given CAD history, will transfuse one more unit\n\nc.\tGI consult for possible EGD (pt likely also needs colonoscopy given h/o tubular adenoma)\n\nd.\tIV zantac \n\ne.\tHold aspirin for now\n\n2.\tCV\n\na.\tIschemia - TVI on admission on ECG from ED. Will cycle cardiac enzymes. Hold aspirin for now given GIB. 
Continue beta blocker both for ischemia protection and AF rate control; will hold other anti-hypertensives for now\n\nb.\tRhythm - unclear duration of Afib so immediate cardioversion not an option (avoid TEE in the setting of UGIB); lopressor for rate control; no anticoagulation for now; central telmetry\n\nc.\tPump - no active issues \n\n3.\tincreased WBC \n\na.\tunclear etiology ? reactive\n\nb.\tAfebrile, no signs of infection; will follow\n\n4.\tdiabetes\n\na.\tSSI\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJohn Kirk, MD\n\nIntern in Medicine \n\nPager 92915\n\n\n\n", "name": "218-01.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2092-10-29"}, {"start": "63", "end": "75", "cui": "PATIENT", "value": "Walton, Levi"}, {"start": "82", "end": "89", "cui": "MEDICALRECORD", "value": "2554172"}, {"start": "97", "end": "105", "cui": "DATE", "value": "10/29/92"}, {"start": "111", "end": "125", "cui": "DOCTOR", "value": "Paul Eggleston"}, {"start": "139", "end": "146", "cui": "DOCTOR", "value": "Youmans"}, {"start": "188", "end": "190", "cui": "AGE", "value": "85"}, {"start": "227", "end": "231", "cui": "DATE", "value": "6/87"}, {"start": "1906", "end": "1910", "cui": "DATE", "value": "2082"}, {"start": "1978", "end": "1982", "cui": "DATE", "value": "2082"}, {"start": "2058", "end": "2062", "cui": "DATE", "value": "6/87"}, {"start": "2206", "end": "2210", "cui": "DATE", "value": "3/87"}, {"start": "2293", "end": "2297", "cui": "DATE", "value": "2087"}, {"start": "2429", "end": "2434", "cui": "DATE", "value": "10/92"}, {"start": "2491", "end": "2496", "cui": "DATE", "value": "10/92"}, {"start": "7749", "end": "7751", "cui": "AGE", "value": "85"}, {"start": "9107", "end": "9116", "cui": "DOCTOR", "value": "John Kirk"}, {"start": "9149", "end": "9154", "cui": "PHONE", "value": "92915"}]}, {"text": "\n\n\nRecord date: 2094-12-26\n\nCARDIOLOGY\n\nCOQUILLE VALLEY HOSPITAL\n\n\n\nReason for visit:\n\n NSTEMI\n\n\n\nInterval History:\n\n Multiple risk factors for CAD including DM, HTN, CRI. Developed complaints of exertional dyspnea and fatigue during the spring of 2094. Progressed and she was evaluated by Bonnie Eaves. Bruce protocol was postive at 9 minutes, 64% PMHR. Complaints of dyspnea. EKG with borderline changes. Nuclear images with moderate sized defect of anteroapical zone with partial reperfusion. Some scar. Mild LV dysfunction with EF 45-50% and apical dyskinesis.\n\n\n\n8/2094 Admitted to CVH for elective cath. Cath with Dr Vitale found nl LM. 99% subtotal LAD with TIMI I flow. Circ was patent. RCA with proximal 30% lesions and mid 80% lesion. Nl LV gram. Treated with PTCA and stent to mid LAD with mini-vision non-drug-eluding stent (2.5 X 18 mm). Excellent results. Distal LAD with residual diffuse 30-40% lesion. Plan to return for staged intervention to RCA.\n\n\n\n9/29/2094 she was readmitted for elective RCA PCI. A relook at her LAD stent found >90% restenosis to proximal edge of LAD stent. Treated with 2.75 X 12 mm TAXUS stent. Tolerated procedure well. Discharged the following day with creat 3.3. Plan was to again return for RCA PCI.\n\n\n\n10/94 Cath: patent LAD stents. Mid RCA lesion treated with 2.5 X 13 mm cypher stent. Discharge delayed for neurologic changes (LOC and unresponsive) but EEG, CNIS, CT, all without new infarct (old PCA CVA). \n\n\n\n12/25 Developed SOB and arrived in EW at SMM with chest pain. EKG without changes. Trop +/- but continued to have chest pain. Given previous stents, she was transferred urgently to CVH for cath. 
\n\n\n\nPast medical history:\n\n CVA of left PCA territory 2093. No residual. Rx with ASA, folate, niaspan\n\nComplete neurologic evaluation in LOC in 10/94 and previously in 2093\n\nIDDM diagnosed 30 yrs ago on insulin pump\n\nCRI with creat baseline 3.7 (followed by Orlando Ernst) and recent eval by Dr Ratliff for transplant. +proteinuria with nephrotic syndrome. Donor kidney (sister)lined up. No date for transplant made...needs to complete Plavix course first.\n\nHTN\n\nHigh cholesterol\n\nDiabetic retinopathy\n\nAnemia\n\n\n\nMedications (Confirmed):\n\n\n\n\t\t\t\n\nColace 100 mg po qd\n\nDiovan 160mg po qd\n\nEpogen 10000u sc sundays\n\nerythromycin 333mg po TID\n\nfolic acid 1 mg po qd\n\nInsulin pump\n\niron supplement 325mg po am\n\nisosorbide dinitrate\n\nLasix 80 mg po qd\n\nmultivitamins 1 tab po qd\n\nNiaspan 1000mg SR po qhs\n\nNorvasc 5mg po q pm\n\nPhoslo and vitamin B 1\n\nPlavix 75mg po qd\n\nToprol XL 25 mg po qd\n\nVytorin 10mg/80mg po qhs\n\n\n\nAllergies:\n\nNo known drug allergy \n\n\n\nFamily history:\n\n Mom A&W in her 60's with HTN. Dad A&W. Sister is planning to be donor kidney. Other sister is back-up donor. \n\n\n\nSocial history:\n\n Works as Patternmaker at IMN. \n\nvolunteer firefighter. Single and lives with her parents. \n\nNever smoked. No ETOH. \n\n\n\nReview of systems:\n\n no peripheral edema currently (but had it in the past). No fever, chills, sweating. Problems with gastroparesis and is planning to have a gastric pacemaker inserted (has been having delayed spikes in insulin). \n\n\n\nPhysical examination:\n\n-BP: 110/50 \n\n-Pulse: 70 \n\n-resp. rate: 16 \n\n-weight: 153 \n\n-General appearance: No acute distress.\n\n-Skin: No rashes, anicteric.\n\n-Heent: Unremarkable\n\n-Neck: Carotids 2+ without bruits. JVP no jugular venous distention\n\n-Chest: Clear to auscultation and percussion.\n\n-Cardiac: Left ventricular impulse discrete and nondisplaced. Regular rate and rhythm, normal S1 and S2, with no S3 or S4. There were no murmurs, clicks or rubs.\n\n-Abdomen: Normal bowel sounds, soft and nontender, with no hepatosplenomegaly or masses appreciated.\n\n-Extremities: No cyanosis, clubbing or edema. 2+ femoral pulses without bruits. 2+ pedal pulses.\n\n-Neuro: A&O x3, CN 2-12 grossly intact. Reflexes 2+ and symmetric x 4 extremities. Toes B downgoing.\n\n\n\nSelected recent labs:\n\n K 4.4, Creat 4.2. WBC 7.5, Creat 35.8, Plts 401. INR 0.9\n\n\n\nAssessment and plan:\n\n 40 y.o. with multiple risk factors for CAD. HTN, high chol, IDDM. Previous LAD stent with restenosis. Then second LAD stent and finally RCA stent. Now with admit for NSTEMI. Transferred for urgent cath. Plan for eventual renal transplant at CVH (had been waiting for Plavix to be completed). Pre-treated with mucomyst. \n\n\n\nGiven her previous restenosis, and the fact that further Plavix courses are postponing her renal transplant....she would be best served with CABG if she has restenosis. 
\n\n\n\nFurther plan per Dr Rollins\n\nFollow up with Dr Eaves\n\n\n\nFrances Travis Potts NP\n\n\n\n\n\nChanges to Medications this visit\n\nDiovan 160mg po qd Start: 09/28/2094\n\nLasix 80 mg po qd Start: 09/28/2094\n\nToprol XL 25 mg po qd Start: 09/28/2094 just increased to 50\n\nNorvasc 5mg po q pm Start: 09/28/2094\n\nPlavix 75mg po qd Start: 09/28/2094\n\nNiaspan 1000mg SR po qhs Start: 09/28/2094\n\nVytorin 10mg/80mg po qhs Start: 09/28/2094\n\nfolic acid 1 mg po qd Start: 09/28/2094\n\nEpogen 10000u sc sundays Start: 09/28/2094\n\nmultivitamins 1 tab po qd Start: 09/28/2094\n\niron supplement 325mg po am Start: 09/28/2094\n\nColace 100 mg po qd Start: 09/28/2094\n\nisosorbide dinitrate Start: 09/28/2094\n\nerythromycin 333mg po TID Start: 09/28/2094 for gastroparesis\n\nPhoslo and vitamin B 1 Start: 09/28/2094\n\nInsulin pump \n\n\n\nSigned electronically by Frances T Potts NP on Dec 26, 2094 \n\n\n\n", "name": "231-02.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2094-12-26"}, {"start": "40", "end": "64", "cui": "HOSPITAL", "value": "COQUILLE VALLEY HOSPITAL"}, {"start": "242", "end": "256", "cui": "DATE", "value": "spring of 2094"}, {"start": "294", "end": "306", "cui": "DOCTOR", "value": "Bonnie Eaves"}, {"start": "572", "end": "578", "cui": "DATE", "value": "8/2094"}, {"start": "591", "end": "594", "cui": "HOSPITAL", "value": "CVH"}, {"start": "627", "end": "633", "cui": "DOCTOR", "value": "Vitale"}, {"start": "972", "end": "981", "cui": "DATE", "value": "9/29/2094"}, {"start": "1253", "end": "1258", "cui": "DATE", "value": "10/94"}, {"start": "1464", "end": "1469", "cui": "DATE", "value": "12/25"}, {"start": "1505", "end": "1508", "cui": "HOSPITAL", "value": "SMM"}, {"start": "1645", "end": "1648", "cui": "HOSPITAL", "value": "CVH"}, {"start": "1715", "end": "1719", "cui": "DATE", "value": "2093"}, {"start": "1805", "end": "1810", "cui": "DATE", "value": "10/94"}, {"start": "1829", "end": "1833", "cui": "DATE", "value": "2093"}, {"start": "1919", "end": "1932", "cui": "DOCTOR", "value": "Orlando Ernst"}, {"start": "1956", "end": "1963", "cui": "DOCTOR", "value": "Ratliff"}, {"start": "2265", "end": "2272", "cui": "DATE", "value": "sundays"}, {"start": "2651", "end": "2655", "cui": "AGE", "value": "60's"}, {"start": "2778", "end": "2790", "cui": "PROFESSION", "value": "Patternmaker"}, {"start": "2794", "end": "2797", "cui": "ORGANIZATION", "value": "IMN"}, {"start": "2801", "end": "2822", "cui": "PROFESSION", "value": "volunteer firefighter"}, {"start": "4015", "end": "4017", "cui": "AGE", "value": "40"}, {"start": "4256", "end": "4259", "cui": "HOSPITAL", "value": "CVH"}, {"start": "4531", "end": "4538", "cui": "DOCTOR", "value": "Rollins"}, {"start": "4558", "end": "4563", "cui": "DOCTOR", "value": "Eaves"}, {"start": "4567", "end": "4587", "cui": "DOCTOR", "value": "Frances Travis Potts"}, {"start": "4658", "end": "4668", "cui": "DATE", "value": "09/28/2094"}, {"start": "4696", "end": "4706", "cui": "DATE", "value": "09/28/2094"}, {"start": "4738", "end": "4748", "cui": "DATE", "value": "09/28/2094"}, {"start": "4799", "end": "4809", "cui": "DATE", "value": "09/28/2094"}, {"start": "4837", "end": "4847", "cui": "DATE", "value": "09/28/2094"}, {"start": "4882", "end": "4892", "cui": "DATE", "value": "09/28/2094"}, {"start": "4927", "end": "4937", "cui": "DATE", "value": "09/28/2094"}, {"start": "4969", "end": "4979", "cui": "DATE", "value": "09/28/2094"}, {"start": "4999", "end": "5006", "cui": "DATE", "value": "sundays"}, {"start": "5014", 
"end": "5024", "cui": "DATE", "value": "09/28/2094"}, {"start": "5060", "end": "5070", "cui": "DATE", "value": "09/28/2094"}, {"start": "5108", "end": "5118", "cui": "DATE", "value": "09/28/2094"}, {"start": "5148", "end": "5158", "cui": "DATE", "value": "09/28/2094"}, {"start": "5192", "end": "5202", "cui": "DATE", "value": "09/28/2094"}, {"start": "5238", "end": "5248", "cui": "DATE", "value": "09/28/2094"}, {"start": "5301", "end": "5311", "cui": "DATE", "value": "09/28/2094"}, {"start": "5360", "end": "5375", "cui": "DOCTOR", "value": "Frances T Potts"}, {"start": "5384", "end": "5396", "cui": "DATE", "value": "Dec 26, 2094"}]}, {"text": "\n\n\nRecord date: 2069-11-18\n\nHPI\n\n54 yo F with h/o CRI, DM on insulin, HTN, obesity, GERD, glaucoma, anemia, p/f for f/u. \n\nShe is generally doing well and has no complaints. She lost her glucometer and hasnt check her BS for 2 weeks. Last A1C after starting insulin down to 7.5, only on 5 glyburide bc of BS lows.\n\nFor her CRI, which is multifactorial (DM, HTN and L renal artery stenosis) she sees Dr Uriarte in Internal Medicine, saw him yesterday, Cr stable and PTH up a bit to 226. \n\n \n\n================ Problems ================\n\nESSENTIAL HYPERTENSION \n\nMODERATE OBESITY \n\nDiabetes mellitus \n\nChronic renal dysfunction cre cl estimate 23 in 12/67\n\nGlaucoma \n\nLactose intolerance \n\nAtypical chest pain negative ETT 4/8/67, negative ETT 2064, ETT w/ SPECT 2062\n\nIron-deficiency anemia \n\nGastritis EGD 9/67, also GERD symptoms\n\n\n\n================ Medications ================\n\nAsa 81 MG PO QD \n\nProcardia XL 60 MG (60MG TABLET take 1) PO QD , may use adalat instead\n\nRanitidine HCL 150 MG PO QD \n\nLosartan 100 MG PO QD \n\nGlyburide 5MG TABLET take 1 Tablet(s) PO QD \n\nFerrous GLUCONATE 325MG TABLET take 1 Tablet(s) PO TID , Take one tablet with every meal for iron deficiency\n\nHydrochlorothiazide 25MG TABLET take 1 Tablet(s) PO QD , Take every day for blood pressure\n\nToprol XL (METOPROLOL SUCCINATE EXTENDED ... 50MG TABLET CR 24HR take 1 Tablet(s) PO QD \n\nCalcitriol 0.25MCG CAPSULE take 1 Capsule(s) PO QD \n\nZocor (SIMVASTATIN) 20 MG (20MG TABLET take 1) PO QHS \n\nLantus (INSULIN GLARGINE) 16 UNITS SC QHS , dose increase\n\nTimolol XE 0.25% 1 DROP OU QD , must keep appt\n\nXalatan (LATANOPROST) 1 DROP OU QPM , must keep scheduled appt \n\n\n\n\n\nALLERGIES NKDA \n\n\n\nPHYSICAL EXAM\n\nVS: BP 110/60 \n\nCor: RRR, nl S1S2, 1/6 sys M, no rubs, gallops\n\nLungs: CTA b/l, no rales, rhonchi or wheezes\n\nAbd: Soft, NTND. Normal active bowel sounds. \n\nNo c/c/e\n\n\n\nA/P\n\n54 yo F with h/o CRI, DM, HTN, obesity, GERD, glaucoma, anemia for f/u. \n\n1. DM: Last A1C 7.5 from 9.3 after starting insulin, still on Lantus 16 and now Glyburide 5 qam. She sees Optho 2x/yr as she has glaucoma, saw them 2mos ago. \n\n--Prescribed glucometer\n\n--Cont current regimen\n\n--check A1C (pt left without getting labs, will get at next visit)\n\n--2/69 urine mcalb/cr 200\n\n--on statin\n\n\n\n2. CRI: Multifactorial, followed by Dr Uriarte. Last Cr 2.8 on 5/69. She has 2ndary hyperparathyroidism as well, PTH yest 226. On Calcitriol. \n\n--Cont ARB. \n\n--F/u with Dr Uriarte. \n\n\n\n3. Anemia, h/o, Fe deficiency, on Fe supp. Last Hct 36.\n\n\n\n4. H/o bowel ischemia but most recent CT wnl so more likely it was an infectious etiology.\n\n\n\n5. HTN: Cont on ARB, Toprol, Procardia, HCTZ. Today somewhat high, increased Procardia to 60 qd. On ASA for 1ry prevention of CAD. Statin.\n\n\n\n6.GERD: Cont Zantac, well controlled sx.\n\n\n\n7. 
Galucoma: followed by Optho, cont eye drops\n\n\n\n8. HM:\n\n--lipids 8/69much improved with addition of statin TC 121, TG 309, HDL 28, LDL 31. Lfts ok. Will check again in Feb\n\n--BMD 10/68 wnl\n\n--colonoscopy 2/68 wnl\n\n--Mammo will be scheduled for 12/69\n\n--PAP overdue, will reschedule a PAP appt\n\n\n\n\n\n\n\n\n\n______________________________ \n\n\n\nQuiana Gagnon, M.D.\n\n \n\n\n\n========================== Preceptor's Note ==========================\n\n\n\nI have discussed the evaluation and care of this patient with Dr. Gagnon.\n\n\n\n\n\n______________________________ \n\n\n\nDavid R. Quintin, M.D.\n\n\n\n", "name": "313-03.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2069-11-18"}, {"start": "33", "end": "35", "cui": "AGE", "value": "54"}, {"start": "402", "end": "409", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "683", "end": "688", "cui": "DATE", "value": "12/67"}, {"start": "783", "end": "789", "cui": "DATE", "value": "4/8/67"}, {"start": "804", "end": "808", "cui": "DATE", "value": "2064"}, {"start": "823", "end": "827", "cui": "DATE", "value": "2062"}, {"start": "885", "end": "889", "cui": "DATE", "value": "9/67"}, {"start": "1938", "end": "1940", "cui": "AGE", "value": "54"}, {"start": "2292", "end": "2296", "cui": "DATE", "value": "2/69"}, {"start": "2371", "end": "2378", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "2396", "end": "2400", "cui": "DATE", "value": "5/69"}, {"start": "2505", "end": "2512", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "2924", "end": "2928", "cui": "DATE", "value": "8/69"}, {"start": "3027", "end": "3030", "cui": "DATE", "value": "Feb"}, {"start": "3038", "end": "3043", "cui": "DATE", "value": "10/68"}, {"start": "3063", "end": "3067", "cui": "DATE", "value": "2/68"}, {"start": "3103", "end": "3108", "cui": "DATE", "value": "12/69"}, {"start": "3225", "end": "3238", "cui": "DOCTOR", "value": "Quiana Gagnon"}, {"start": "3396", "end": "3402", "cui": "DOCTOR", "value": "Gagnon"}, {"start": "3473", "end": "3489", "cui": "DOCTOR", "value": "David R. Quintin"}]}]}]} \ No newline at end of file diff --git a/tests/utils/ner/test_deid.py b/tests/utils/ner/test_deid.py index dcc8938b8..97ca8334b 100644 --- a/tests/utils/ner/test_deid.py +++ b/tests/utils/ner/test_deid.py @@ -3,12 +3,15 @@ from medcat.ner import transformers_ner -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from typing import Any, List, Tuple import os +import json +import tempfile import unittest +import timeout_decorator FILE_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -20,6 +23,9 @@ TRAIN_DATA = os.path.join(FILE_DIR, "..", "..", "resources", "deid_train_data.json") +TEST_DATA = os.path.join(FILE_DIR, "..", "..", + "resources", "deid_test_data.json") + class DeIDmodelCreationTests(unittest.TestCase): @@ -57,6 +63,17 @@ def train_model_once(model: deid.DeIdModel, ) -> Tuple[Tuple[Any, Any, Any], deid.DeIdModel]: if not _trained: retval = model.train(TRAIN_DATA) + # mpp = 'temp/deid_multiprocess/dumps/temp_model_save' + # NOTE: it seems that after training the model leaves + # it in a state where it can no longer be used + # for multiprocessing. 
So in order to avoid that + # we save the model on disk and load it again + with tempfile.TemporaryDirectory() as dir_name: + print("Saving model on disk") + mpn = model.cat.create_model_pack(dir_name) + print("Loading model") + model = deid.DeIdModel.load_model_pack(os.path.join(dir_name, mpn)) + print("Loaded model off disk") _trained.append((retval, model)) return _trained[0] @@ -105,7 +122,10 @@ def setUpClass(cls) -> None: def test_model_works_deid_text(self): anon_text = self.deid_model.deid_text(input_text) self.assertIn("[DOCTOR]", anon_text) + self.assertNotIn("M. Sully", anon_text) self.assertIn("[HOSPITAL]", anon_text) + # self.assertNotIn("Dublin", anon_text) + self.assertNotIn("7 Eccles Street", anon_text) def test_model_works_dunder_call(self): anon_doc = self.deid_model(input_text) @@ -115,4 +135,52 @@ def test_model_works_deid_text_redact(self): anon_text = self.deid_model.deid_text(input_text, redact=True) self.assertIn("****", anon_text) self.assertNotIn("[DOCTOR]", anon_text) + self.assertNotIn("M. Sully", anon_text) self.assertNotIn("[HOSPITAL]", anon_text) + # self.assertNotIn("Dublin", anon_text) + self.assertNotIn("7 Eccles Street", anon_text) + +class DeIDModelMultiprocessingWorks(unittest.TestCase): + processes = 2 + + @classmethod + def setUpClass(cls) -> None: + Span.set_extension('link_candidates', default=None, force=True) + _add_model(cls) + cls.deid_model = train_model_once(cls.deid_model)[1] + with open(TEST_DATA) as f: + raw_data = json.load(f) + cls.data = [] + for project in raw_data['projects']: + for doc in project['documents']: + cls.data.append((f"{project['name']}_{doc['name']}", doc['text'])) + + def assertTextHasBeenDeIded(self, text: str, redacted: bool): + if not redacted: + for cui in self.deid_model.cdb.cui2names: + cui_name = self.deid_model.cdb.get_name(cui) + if cui_name in text: + # all good + return + else: + # if redacted, only check once... 
+ if "******" in text: + # all good + return + raise AssertionError("None of the CUIs found") + + @timeout_decorator.timeout(3 * 60) # 3 minutes max + def test_model_can_multiprocess_no_redact(self): + processed = self.deid_model.deid_multi_texts(self.data, n_process=self.processes) + self.assertEqual(len(processed), 5) + for tid, new_text in enumerate(processed): + with self.subTest(str(tid)): + self.assertTextHasBeenDeIded(new_text, redacted=False) + + @timeout_decorator.timeout(3 * 60) # 3 minutes max + def test_model_can_multiprocess_redact(self): + processed = self.deid_model.deid_multi_texts(self.data, n_process=self.processes, redact=True) + self.assertEqual(len(processed), 5) + for tid, new_text in enumerate(processed): + with self.subTest(str(tid)): + self.assertTextHasBeenDeIded(new_text, redacted=True) From d01084cb167b98af9a38af02cbd51f136577fc0e Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 13 Feb 2024 17:55:45 +0000 Subject: [PATCH 64/64] Cu 8693u6b4u tests continue on fail (#400) * CU-8693u6b4u: Make sure failed/errored tests fail the main workflow * CU-8693u6b4u: Attempt to fix deid multiprocessing, at least for GHA * CU-8693u6b4u: Fix small docstring issue --- .github/workflows/main.yml | 1 - medcat/config_meta_cat.py | 2 +- tests/utils/ner/test_deid.py | 7 +++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 687160ed9..7c7a2b742 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,7 +34,6 @@ jobs: - name: Test run: | timeout 17m python -m unittest discover - continue-on-error: true publish-to-test-pypi: diff --git a/medcat/config_meta_cat.py b/medcat/config_meta_cat.py index 47f42dc28..6ddd71d56 100644 --- a/medcat/config_meta_cat.py +++ b/medcat/config_meta_cat.py @@ -38,7 +38,7 @@ class General(MixingConfig, BaseModel): pipe_batch_size_in_chars: int = 20000000 """How many characters are piped at once into the meta_cat class""" span_group: Optional[str] = None - """If set, the spacy span group that the metacat model will assign annotations. + """If set, the spacy span group that the metacat model will assign annotations. Otherwise defaults to doc._.ents or doc.ents per the annotate_overlapping settings""" class Config: diff --git a/tests/utils/ner/test_deid.py b/tests/utils/ner/test_deid.py index 97ca8334b..01c9c1af3 100644 --- a/tests/utils/ner/test_deid.py +++ b/tests/utils/ner/test_deid.py @@ -154,6 +154,13 @@ def setUpClass(cls) -> None: for project in raw_data['projects']: for doc in project['documents']: cls.data.append((f"{project['name']}_{doc['name']}", doc['text'])) + # NOTE: Comment and subsequent code + # copied from CAT.multiprocessing_batch_char_size + # (lines 1234 - 1237) + # Hack for torch using multithreading, which is not good if not + #separate_nn_components, need for CPU runs only + import torch + torch.set_num_threads(1) def assertTextHasBeenDeIded(self, text: str, redacted: bool): if not redacted: