From 8d5fcb5713bc3dc05d5c0e16b90fe196e8327ccb Mon Sep 17 00:00:00 2001
From: "E.C. Wood"
Date: Wed, 12 Jul 2023 13:25:18 -0700
Subject: [PATCH 001/117] #316 starting to piece together how UMLS is structured based on MySQL and umls2rdf code to scope out this issue

---
 misc-tools/mysql_table_to_md_table.sh |  30 +
 understanding_umls.md                 | 599 ++++++++++++++++++++++++++
 2 files changed, 629 insertions(+)
 create mode 100755 misc-tools/mysql_table_to_md_table.sh
 create mode 100644 understanding_umls.md

diff --git a/misc-tools/mysql_table_to_md_table.sh b/misc-tools/mysql_table_to_md_table.sh
new file mode 100755
index 00000000..673b3e4d
--- /dev/null
+++ b/misc-tools/mysql_table_to_md_table.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+#
+# Convert a table pasted from the mysql command-line client into a Markdown
+# table, rewriting the file in place.
+#
+# Usage: mysql_table_to_md_table.sh [TABLE_FILE]
+#   TABLE_FILE  file to convert (default: umls_table.txt)
+#
+# Uses "sed -i" with no backup suffix, which is the GNU sed form.
+
+set -euo pipefail
+
+table_file="${1:-umls_table.txt}"
+
+if [[ ! -f "${table_file}" ]]; then
+  printf 'error: %s: no such file\n' "${table_file}" >&2
+  exit 1
+fi
+
+# sed applies the expressions to each line in the order listed, so one pass
+# gives the same result as five separate in-place rewrites of the file.
+sed_args=(
+  -e 's/\+(-)+/\|--/g'  # border cell "+-----" becomes "|--"
+  -e 's/^\|( )*//g'     # drop the leading "|" and the spaces after it
+  -e 's/--\+$/--/g'     # drop the "+" that closes a border line
+  -e 's/( )+/ /g'       # squeeze each run of spaces into one space
+  -e 's/<|>//g'         # strip "<" and ">"
+)
+
+sed -i -E "${sed_args[@]}" -- "${table_file}"
diff --git a/understanding_umls.md b/understanding_umls.md
new file mode 100644
index 00000000..02baabd0
--- /dev/null
+++ b/understanding_umls.md
@@ -0,0 +1,599 @@
+## UMLS MySQL Walk Through
+
+# RegEx MySQL Table -> Markdown Table
+1. Replace: `\+(-)+` With: `\|--`
+2. Replace: `^\|( )*` With: Nothing
+3. Replace: `--\+$` With: `--`
+4. 
Replace: `( )+` With: ` ` +5. Replace: `<|>` With: Nothing + +# Tables +``` +mysql> show tables; +``` + +Tables_in_umls | +--| +AMBIGLUI | +AMBIGSUI | +DELETEDCUI | +DELETEDLUI | +DELETEDSUI | +MERGEDCUI | +MERGEDLUI | +MRAUI | +MRCOLS | +MRCONSO | +MRCUI | +MRDEF | +MRDOC | +MRFILES | +MRHIER | +MRHIST | +MRMAP | +MRRANK | +MRREL | +MRSAB | +MRSAT | +MRSMAP | +MRSTY | +MRXNS_ENG | +MRXNW_ENG | +MRXW_BAQ | +MRXW_CHI | +MRXW_CZE | +MRXW_DAN | +MRXW_DUT | +MRXW_ENG | +MRXW_EST | +MRXW_FIN | +MRXW_FRE | +MRXW_GER | +MRXW_GRE | +MRXW_HEB | +MRXW_HUN | +MRXW_ITA | +MRXW_JPN | +MRXW_KOR | +MRXW_LAV | +MRXW_NOR | +MRXW_POL | +MRXW_POR | +MRXW_RUS | +MRXW_SCR | +MRXW_SPA | +MRXW_SWE | +MRXW_TUR | + +``` +mysql> select * from MRCUI limit 10; +``` +CUI1 | VER | REL | RELA | MAPREASON | CUI2 | MAPIN | +--|--|--|--|--|--|-- +C0000002 | 2000AC | SY | NULL | NULL | C0007404 | Y | +C0000003 | 1999AA | SY | NULL | NULL | C0010504 | Y | +C0000024 | 1993AA | SY | NULL | NULL | C0043791 | Y | +C0000105 | 1995AA | SY | NULL | NULL | C0001964 | Y | +C0000136 | 1993AA | DEL | NULL | NULL | NULL | NULL | +C0000140 | 1993AA | DEL | NULL | NULL | NULL | NULL | +C0000158 | 1993AA | DEL | NULL | NULL | NULL | NULL | +C0000164 | 2003AB | RO | NULL | NULL | C0000163 | Y | +C0000177 | 1993AA | SY | NULL | NULL | C0014924 | Y | +C0000219 | 1993AA | DEL | NULL | NULL | NULL | NULL | + +``` +mysql> select * from MRCOLS; +``` + +COL | DES | REF | MIN | AV | MAX | FIL | DTY | +--|--|--|--|--|--|--|-- +ATNL | Attribute name list for a source. | NULL | 0 | 69.84 | 1178 | MRSAB.RRF | varchar(4000) | +ATN | Attribute name | NULL | 2 | 10.38 | 62 | MRSAT.RRF | varchar(100) | +ATUI | Unique identifier for attribute. | NULL | 10 | 10.64 | 11 | MRSTY.RRF | varchar(11) | +ATUI | Unique identifier for attribute. | NULL | 10 | 10.85 | 11 | MRSAT.RRF | varchar(11) | +ATUI | Unique identifier for attribute. 
| NULL | 10 | 10.86 | 11 | MRDEF.RRF | varchar(11) | +ATV | Attribute value | NULL | 1 | 12.69 | 35985 | MRSAT.RRF | varchar(65000) | +AUI1 | Unique identifier for first atom | NULL | 0 | 8.52 | 9 | MRREL.RRF | varchar(9) | +AUI1 | Unique identifier for first atom | NULL | 8 | 8.54 | 9 | MRAUI.RRF | varchar(9) | +AUI2 | Unique identifier for second atom | NULL | 0 | 8.52 | 9 | MRREL.RRF | varchar(9) | +AUI2 | Unique identifier for second atom | NULL | 8 | 8.54 | 9 | MRAUI.RRF | varchar(9) | +AUI | Unique identifier for atom | NULL | 8 | 8.58 | 9 | MRHIER.RRF | varchar(9) | +AUI | Unique identifier for atom | NULL | 8 | 8.74 | 9 | MRDEF.RRF | varchar(9) | +AUI | Unique identifier for atom | NULL | 8 | 8.77 | 9 | MRCONSO.RRF | varchar(9) | +AV | Average Length, Characters | NULL | 4 | 4.12 | 6 | MRCOLS.RRF | numeric(5,2) | +BTS | Size in Bytes | NULL | 1 | 7.19 | 10 | MRFILES.RRF | integer | +CENC | Character encoding of a source as specified by IANA | NULL | 5 | 5.00 | 5 | MRSAB.RRF | varchar(20) | +CFR | CUI frequency for a source | NULL | 1 | 4.18 | 6 | MRSAB.RRF | integer | +CHANGEKEY | CONCEPTSTATUS (if history relates to a SNOMED CT concept) or DESCRIPTIONSTATUS (if history relates to a SNOMED CT atom or "description") | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | +CHANGETYPE | Source asserted code for type of change | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | +CHANGEVAL | SNOMED CT CONCEPTSTATUS or DESCRIPTIONSTATUS value after the change took place | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | +CLS | Number of columns | NULL | 1 | 1.12 | 2 | MRFILES.RRF | integer | +CODE | Unique Identifier or code for string in source | NULL | 0 | 4.46 | 56 | MRSAT.RRF | varchar(100) | +CODE | Unique Identifier or code for string in source | NULL | 1 | 7.50 | 95 | MRCONSO.RRF | varchar(100) | +COL | Column or data element name | NULL | 2 | 3.71 | 11 | MRCOLS.RRF | varchar(20) | +CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | 
MRAUI.RRF | char(8) | +CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | MRCUI.RRF | char(8) | +CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | MRREL.RRF | char(8) | +CUI2 | Unique identifier for second concept | NULL | 0 | 3.33 | 8 | MRCUI.RRF | char(8) | +CUI2 | Unique identifier for second concept | NULL | 8 | 8.00 | 8 | MRAUI.RRF | char(8) | +CUI2 | Unique identifier for second concept | NULL | 8 | 8.00 | 8 | MRREL.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 0 | 0.00 | 0 | MRHIST.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | AMBIGLUI.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | AMBIGSUI.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | CHANGE/MERGEDCUI.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRCONSO.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRDEF.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRHIER.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRSAT.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRSTY.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXNS_ENG.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXNW_ENG.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_ARA.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_CHI.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_CZE.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_DUT.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | 
MRXW_ENG.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_EST.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_FRE.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_GER.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_GRE.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_HUN.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_ITA.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_JPN.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_KOR.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_LAV.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_NOR.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_POL.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_POR.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_RUS.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SCR.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SPA.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SWE.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_TUR.RRF | char(8) | +CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_UKR.RRF | char(8) | +CURVER | Current Version flag | NULL | 1 | 1.00 | 1 | MRSAB.RRF | char(1) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRDEF.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRHIER.RRF | varchar(50) | +CVF | Content view flag | 
NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRREL.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRSAT.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRSMAP.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 1.22 | 5 | MRCONSO.RRF | varchar(50) | +CVF | Content view flag | NULL | 0 | 2.13 | 5 | MRSTY.RRF | varchar(50) | +CXN | The context number if the atom has multiple contexts | NULL | 1 | 2.17 | 5 | MRHIER.RRF | integer | +CXTY | Context type for a source | NULL | 0 | 5.14 | 13 | MRSAB.RRF | varchar(50) | +DEF | Definition | NULL | 1 | 232.23 | 10939 | MRDEF.RRF | varchar(16000) | +DES | Descriptive Name | NULL | 5 | 28.81 | 136 | MRCOLS.RRF | varchar(200) | +DES | Descriptive Name | NULL | 8 | 18.25 | 42 | MRFILES.RRF | varchar(200) | +DIR | Source asserted directionality flag | NULL | 0 | 0.13 | 1 | MRREL.RRF | varchar(1) | +DOCKEY | Key to be documented | NULL | 2 | 3.65 | 8 | MRDOC.RRF | varchar(50) | +DTY | SQL-92 data type for this column | NULL | 7 | 10.02 | 14 | MRCOLS.RRF | varchar(20) | +EXPL | Detailed explanation | NULL | 0 | 26.57 | 941 | MRDOC.RRF | varchar(1000) | +FIL | Physical FILENAME | NULL | 9 | 10.99 | 21 | MRCOLS.RRF | varchar(50) | +FIL | Physical FILENAME | NULL | 9 | 12.12 | 21 | MRFILES.RRF | varchar(50) | +FMT | Comma separated list of COL | NULL | 7 | 29.69 | 190 | MRFILES.RRF | varchar(300) | +FROMEXPR | The expression that a mapping is mapped from | NULL | 1 | 6.93 | 9 | MRSMAP.RRF | varchar(4000) | +FROMEXPR | The expression that a mapping is mapped from | NULL | 1 | 8.29 | 18 | MRMAP.RRF | varchar(4000) | +FROMID | Metathesaurus identifier for the entity being mapped from | NULL | 1 | 7.31 | 18 | MRMAP.RRF | varchar(50) | +FROMRES | Restriction applicable to the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | +FROMRULE | 
Machine processible rule applicable to the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | +FROMSID | Source asserted identifier for the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | +FROMTYPE | The type of expression that a mapping is mapped from | NULL | 3 | 3.98 | 4 | MRSMAP.RRF | varchar(50) | +FROMTYPE | The type of expression that a mapping is mapped from | NULL | 3 | 3.99 | 4 | MRMAP.RRF | varchar(50) | +HCD | Source asserted hierarchical number or code of context member (if it exists) | NULL | 0 | 0.48 | 51 | MRHIER.RRF | varchar(100) | +IMETA | Version of the Metathesaurus that a source was added | NULL | 6 | 6.00 | 6 | MRSAB.RRF | varchar(10) | +ISPREF | Indicates whether AUI is preferred | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | +LAT | Language of Term(s) | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | char(3) | +LAT | Language of Term(s) | NULL | 0 | 2.97 | 3 | MRSAB.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRCONSO.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXNS_ENG.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXNW_ENG.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ARA.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_BAQ.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_CHI.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_CZE.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_DAN.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_DUT.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ENG.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_EST.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_FIN.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_FRE.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 
| 3.00 | 3 | MRXW_GER.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_GRE.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_HEB.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_HUN.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ITA.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_JPN.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_KOR.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_LAV.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_NOR.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_POL.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_POR.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_RUS.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SCR.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SPA.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SWE.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_TUR.RRF | char(3) | +LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_UKR.RRF | char(3) | +LUI | Unique identifier for term | NULL | 0 | 0.00 | 0 | CHANGE/MERGEDLUI.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 0 | 4.50 | 9 | MRSAT.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.17 | 9 | MRXW_SCR.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.20 | 9 | MRXW_JPN.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 
| 8.22 | 9 | AMBIGLUI.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.43 | 9 | MRXW_ENG.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRCONSO.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRXNS_ENG.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRXNW_ENG.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.48 | 9 | MRXW_CZE.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.52 | 9 | MRXW_DUT.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.58 | 9 | MRXW_GER.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.67 | 9 | MRXW_SPA.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.77 | 9 | MRXW_POR.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.77 | 9 | MRXW_RUS.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.78 | 9 | MRXW_ITA.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.78 | 9 | MRXW_POL.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.80 | 9 | MRXW_FRE.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.82 | 9 | MRXW_SWE.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.90 | 9 | MRXW_KOR.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.98 | 9 | MRXW_NOR.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.99 | 9 | MRXW_HUN.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 8 | 8.99 | 9 | MRXW_LAV.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_ARA.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_CHI.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_EST.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_GRE.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 
9 | 9.00 | 9 | MRXW_TUR.RRF | varchar(10) | +LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_UKR.RRF | varchar(10) | +MAPATN | Mapping attribute name (for future use) | NULL | 0 | 2.82 | 6 | MRMAP.RRF | varchar(20) | +MAPATV | Mapping attribute value (for future use) | NULL | 0 | 0.00 | 1 | MRMAP.RRF | varchar(4000) | +MAPID | Metathesaurus asserted identifier for mapping | NULL | 10 | 10.98 | 11 | MRSMAP.RRF | varchar(50) | +MAPID | Metathesaurus asserted identifier for mapping | NULL | 10 | 10.99 | 11 | MRMAP.RRF | varchar(50) | +MAPIN | Mapping in current subset | NULL | 0 | 0.42 | 1 | MRCUI.RRF | char(1) | +MAPIN | Mapping in current subset | NULL | 1 | 1.00 | 1 | MRAUI.RRF | char(1) | +MAPRANK | Order in which mappings in a subset should be applied | NULL | 0 | 0.49 | 2 | MRMAP.RRF | integer | +MAPREASON | Reason for mapping | NULL | 0 | 0.00 | 4 | MRCUI.RRF | varchar(4000) | +MAPREASON | Reason for mapping | NULL | 4 | 4.00 | 4 | MRAUI.RRF | varchar(4000) | +MAPRES | Restriction applicable to this mapping | NULL | 0 | 34.78 | 429 | MRMAP.RRF | varchar(4000) | +MAPRULE | Machine processible rule applicable to this mapping | NULL | 0 | 9.57 | 336 | MRMAP.RRF | varchar(4000) | +MAPSETCUI | CUI of the map set | NULL | 8 | 8.00 | 8 | MRMAP.RRF | char(8) | +MAPSETCUI | CUI of the map set | NULL | 8 | 8.00 | 8 | MRSMAP.RRF | char(8) | +MAPSETSAB | SAB of the map set | NULL | 3 | 10.60 | 13 | MRSMAP.RRF | varchar(40) | +MAPSETSAB | SAB of the map set | NULL | 3 | 10.71 | 13 | MRMAP.RRF | varchar(40) | +MAPSID | Source asserted identifier for mapping | NULL | 0 | 0.00 | 0 | MRSMAP.RRF | varchar(50) | +MAPSID | Source asserted identifier for mapping | NULL | 0 | 0.01 | 36 | MRMAP.RRF | varchar(50) | +MAPSUBSETID | Map subset identifier used to identify a subset of related mappings within a map set | NULL | 0 | 0.49 | 1 | MRMAP.RRF | varchar(10) | +MAPTYPE | Type of mapping | NULL | 0 | 4.26 | 9 | MRMAP.RRF | varchar(50) | +MAX | Maximum Length | NULL | 1 
| 1.37 | 5 | MRCOLS.RRF | integer | +METAUI | Metathesaurus asserted unique identifier | NULL | 0 | 7.85 | 10 | MRSAT.RRF | varchar(50) | +MIN | Minimum Length | NULL | 1 | 1.02 | 2 | MRCOLS.RRF | integer | +NSTR | Normalized string | NULL | 1 | 38.86 | 2460 | MRXNS_ENG.RRF | varchar(3000) | +NWD | Normalized word | NULL | 1 | 6.55 | 80 | MRXNW_ENG.RRF | varchar(100) | +PAUI | Unique identifier for parent atom | NULL | 0 | 8.46 | 9 | MRHIER.RRF | varchar(9) | +PCUI | Concept unique identifier in the previous Metathesaurus | NULL | 8 | 8.00 | 8 | CHANGE/DELETEDCUI.RRF | char(8) | +PCUI | Concept unique identifier in the previous Metathesaurus | NULL | 8 | 8.00 | 8 | CHANGE/MERGEDCUI.RRF | char(8) | +PLUI | Lexical unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDLUI.RRF | varchar(10) | +PLUI | Lexical unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/MERGEDLUI.RRF | varchar(10) | +PSTR | Preferred name in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDLUI.RRF | varchar(3000) | +PSTR | Preferred name in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | varchar(3000) | +PSTR | Preferred name in the previous Metathesaurus | NULL | 4 | 4.00 | 4 | CHANGE/DELETEDCUI.RRF | varchar(3000) | +PSUI | String unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | varchar(10) | +PTR | Path to root | NULL | 0 | 103.81 | 345 | MRHIER.RRF | varchar(1000) | +RANK | Termgroup ranking | NULL | 4 | 4.00 | 4 | MRRANK.RRF | integer | +RCUI | Unique identifier for root SRC concept | NULL | 8 | 8.00 | 8 | MRSAB.RRF | char(8) | +REASON | Explanation of change, if present | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | +REF | Documentation Section Number | NULL | 0 | 0.00 | 0 | MRCOLS.RRF | varchar(20) | +RELA | Additional relationship label | NULL | 0 | 0.00 | 0 | MRAUI.RRF | varchar(100) | +RELA | Additional relationship label | NULL | 0 
| 0.00 | 0 | MRCUI.RRF | varchar(100) | +RELA | Additional relationship label | NULL | 0 | 10.69 | 54 | MRREL.RRF | varchar(100) | +RELA | Additional relationship label | NULL | 0 | 14.07 | 37 | MRMAP.RRF | varchar(100) | +RELA | Additional relationship label | NULL | 0 | 19.91 | 37 | MRSMAP.RRF | varchar(100) | +RELA | Additional relationship label | NULL | 0 | 2.71 | 12 | MRHIER.RRF | varchar(100) | +REL | Relationship label | NULL | 0 | 0.00 | 0 | MRAUI.RRF | varchar(4) | +REL | Relationship label | NULL | 2 | 2.00 | 2 | MRMAP.RRF | varchar(4) | +REL | Relationship label | NULL | 2 | 2.00 | 2 | MRSMAP.RRF | varchar(4) | +REL | Relationship label | NULL | 2 | 2.24 | 3 | MRREL.RRF | varchar(4) | +REL | Relationship label | NULL | 2 | 2.65 | 4 | MRCUI.RRF | varchar(4) | +RG | Relationship group | NULL | 0 | 0.06 | 2 | MRREL.RRF | varchar(10) | +RMETA | Version of the Metathesaurus where a version is removed | NULL | 0 | 0.09 | 6 | MRSAB.RRF | varchar(10) | +RSAB | Root source abbreviation | NULL | 2 | 5.94 | 15 | MRSAB.RRF | varchar(40) | +RUI | Unique identifier for relationship | NULL | 9 | 9.82 | 10 | MRREL.RRF | varchar(10) | +RWS | Number of rows | NULL | 1 | 5.56 | 8 | MRFILES.RRF | integer | +SABIN | Source in current subset | NULL | 1 | 1.00 | 1 | MRSAB.RRF | char(1) | +SAB | Source abbreviation | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 4.12 | 11 | MRDEF.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 5.31 | 15 | MRRANK.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 5.48 | 15 | MRREL.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 5.70 | 15 | MRCONSO.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 5.75 | 13 | MRSAT.RRF | varchar(40) | +SAB | Source abbreviation | NULL | 2 | 7.90 | 13 | MRHIER.RRF | varchar(40) | +SATUI | Source asserted attribute identifier | NULL | 0 | 0.47 | 16 | MRDEF.RRF | varchar(50) | +SATUI | Source asserted attribute identifier | NULL | 
0 | 3.24 | 36 | MRSAT.RRF | varchar(50) | +SAUI | Source asserted atom identifier | NULL | 0 | 1.73 | 18 | MRCONSO.RRF | varchar(100) | +SCC | Content contact info for a source | NULL | 0 | 152.05 | 332 | MRSAB.RRF | varchar(1000) | +SCIT | Source citation | NULL | 54 | 164.09 | 674 | MRSAB.RRF | varchar(4000) | +SCUI | Source asserted concept identifier | NULL | 0 | 5.28 | 95 | MRCONSO.RRF | varchar(100) | +SDUI | Source asserted descriptor identifier | NULL | 0 | 2.73 | 13 | MRCONSO.RRF | varchar(100) | +SF | Source Family | NULL | 2 | 4.20 | 13 | MRSAB.RRF | varchar(40) | +SLC | License contact info for a source | NULL | 12 | 167.35 | 346 | MRSAB.RRF | varchar(1000) | +SL | Source of relationship labels | NULL | 2 | 5.48 | 15 | MRREL.RRF | varchar(40) | +SON | Source Official Name | NULL | 10 | 48.65 | 145 | MRSAB.RRF | varchar(3000) | +SOURCEUI | Source asserted unique identifier | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(50) | +SRL | Source Restriction Level | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | integer | +SRL | Source Restriction Level | NULL | 1 | 1.00 | 1 | MRSAB.RRF | integer | +SRUI | Source attributed relationship identifier | NULL | 0 | 1.20 | 36 | MRREL.RRF | varchar(50) | +SSN | Source short name | NULL | 3 | 26.96 | 89 | MRSAB.RRF | varchar(3000) | +STN | Semantic type tree number | NULL | 1 | 7.85 | 14 | MRSTY.RRF | varchar(100) | +STR | String | NULL | 1 | 38.20 | 2930 | MRCONSO.RRF | varchar(3000) | +STT | String type | NULL | 2 | 2.01 | 3 | MRCONSO.RRF | varchar(3) | +STYPE1 | The name of the column in MRCONSO.RRF that contains the first identifier to which the relationship is attached | NULL | 3 | 3.62 | 4 | MRREL.RRF | varchar(50) | +STYPE2 | The name of the column in MRCONSO.RRF that contains the second identifier to which the relationship is attached | NULL | 3 | 3.62 | 4 | MRREL.RRF | varchar(50) | +STYPE | The name of the column in MRCONSO.RRF or MRREL.RRF that contains the identifier to which the attribute is attached | NULL | 3 | 3.25 | 
4 | MRSAT.RRF | varchar(50) | +STY | Semantic type | NULL | 4 | 17.65 | 39 | MRSTY.RRF | varchar(50) | +SUI | Unique identifier for string | NULL | 0 | 4.57 | 9 | MRSAT.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.35 | 9 | AMBIGSUI.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.35 | 9 | MRXW_JPN.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.53 | 9 | MRXW_DUT.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.58 | 9 | MRCONSO.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.61 | 9 | MRXW_GER.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.64 | 9 | MRXNS_ENG.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.67 | 9 | MRXNW_ENG.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.67 | 9 | MRXW_ENG.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.71 | 9 | MRXW_SPA.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.79 | 9 | MRXW_POR.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.79 | 9 | MRXW_RUS.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.82 | 9 | MRXW_ITA.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.84 | 9 | MRXW_SWE.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.85 | 9 | MRXW_CZE.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.85 | 9 | MRXW_FRE.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 8 | 8.98 | 9 | MRXW_NOR.RRF | varchar(10) | +SUI | Unique identifier for 
string | NULL | 8 | 8.99 | 9 | MRXW_HUN.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_ARA.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_CHI.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_EST.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_GRE.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_KOR.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_LAV.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_POL.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_SCR.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_TUR.RRF | varchar(10) | +SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_UKR.RRF | varchar(10) | +SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | +SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRDEF.RRF | char(1) | +SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRRANK.RRF | char(1) | +SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRREL.RRF | char(1) | +SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRSAT.RRF | char(1) | +SVER | Release date or version number of a source | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(20) | +SVER | Release date or version number of a source | NULL | 0 | 5.08 | 15 | MRSAB.RRF | varchar(20) | +TFR | Term frequency for a source | NULL | 1 | 4.41 | 7 | MRSAB.RRF | integer | +TOEXPR | The expression that a mapping is mapped to | NULL | 0 | 6.03 | 242 | MRMAP.RRF | varchar(4000) | +TOEXPR | The expression that a mapping is mapped to | NULL | 1 | 6.92 | 242 | MRSMAP.RRF | varchar(4000) | +TOID | Metathesaurus identifier for the entity being mapped to | NULL | 0 | 5.18 | 18 | MRMAP.RRF | varchar(50) | +TORES | Restriction applicable to the entity being 
mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | +TORULE | Machine processible rule applicable to the entity being mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | +TOSID | Source asserted identifier for the entity being mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | +TOTYPE | The type of expression that a mapping is mapped to | NULL | 0 | 3.98 | 23 | MRMAP.RRF | varchar(50) | +TOTYPE | The type of expression that a mapping is mapped to | NULL | 4 | 4.36 | 22 | MRSMAP.RRF | varchar(50) | +TS | Term status | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | +TTYL | Term type list for a source | NULL | 0 | 11.76 | 86 | MRSAB.RRF | varchar(400) | +TTY | Term type in source | NULL | 2 | 2.35 | 11 | MRCONSO.RRF | varchar(20) | +TTY | Term type in source | NULL | 2 | 2.58 | 11 | MRRANK.RRF | varchar(20) | +TUI | Unique identifier of Semantic type | NULL | 4 | 4.00 | 4 | MRSTY.RRF | char(4) | +TYPE | Type of information | NULL | 3 | 13.14 | 21 | MRDOC.RRF | varchar(50) | +VALUE | Value | NULL | 0 | 15.98 | 62 | MRDOC.RRF | varchar(200) | +VCUI | Unique identifier for versioned SRC concept | NULL | 0 | 7.71 | 8 | MRSAB.RRF | char(8) | +VEND | Valid end date for a source | NULL | 0 | 0.00 | 0 | MRSAB.RRF | char(8) | +VER | Last release version in which CUI1 was valid | NULL | 6 | 6.00 | 6 | MRAUI.RRF | varchar(10) | +VER | Last release version in which CUI1 was valid | NULL | 6 | 6.00 | 6 | MRCUI.RRF | varchar(10) | +VSAB | Versioned source abbreviation | NULL | 3 | 11.35 | 24 | MRSAB.RRF | varchar(40) | +VSTART | Valid start date for a source | NULL | 0 | 0.00 | 0 | MRSAB.RRF | char(8) | +WD | Word in lower-case | NULL | 1 | 10.53 | 54 | MRXW_FIN.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 2.90 | 38 | MRXW_KOR.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 3.65 | 68 | MRXW_CHI.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 4.58 | 35 | MRXW_EST.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 
| 5.23 | 37 | MRXW_TUR.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 5.47 | 22 | MRXW_ARA.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 5.71 | 38 | MRXW_POR.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 5.91 | 38 | MRXW_ITA.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.12 | 19 | MRXW_HEB.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.13 | 24 | MRXW_UKR.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.23 | 80 | MRXW_ENG.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.38 | 25 | MRXW_DAN.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.67 | 46 | MRXW_SPA.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 6.83 | 39 | MRXW_FRE.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.14 | 40 | MRXW_RUS.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.17 | 18 | MRXW_BAQ.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.50 | 34 | MRXW_GRE.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.55 | 48 | MRXW_POL.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.57 | 52 | MRXW_CZE.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.89 | 51 | MRXW_DUT.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.97 | 27 | MRXW_HUN.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 7.98 | 29 | MRXW_LAV.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 8.02 | 37 | MRXW_SCR.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 8.37 | 41 | MRXW_GER.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 8.61 | 39 | MRXW_SWE.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 8.91 | 85 | MRXW_JPN.RRF | varchar(500) | +WD | Word in lower-case | NULL | 1 | 9.11 | 44 | MRXW_NOR.RRF | varchar(500) | + +``` +mysql> select * from MRREL limit 10; +``` +CUI1 | AUI1 | STYPE1 | REL | CUI2 | AUI2 | STYPE2 | RELA | RUI | SRUI | SAB | SL | RG | DIR | SUPPRESS | CVF | 
+--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|-- +C0236642 | A0001895 | AUI | RB | C0270715 | A1389616 | AUI | NULL | R00689636 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0003787 | A0002112 | AUI | RB | C0037728 | A1397168 | AUI | NULL | R00689637 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0018090 | A0002644 | AUI | RB | C0032636 | A0103514 | AUI | NULL | R00689638 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0039194 | A0003844 | AUI | RB | C0024264 | A0483067 | AUI | NULL | R00689639 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0004561 | A0003849 | AUI | RB | C0024264 | A0483067 | AUI | NULL | R00689640 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0022801 | A0006210 | AUI | RB | C0035287 | A0488404 | AUI | NULL | R00689641 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0022801 | A0006210 | AUI | RB | C0227525 | A1182577 | AUI | NULL | R00689642 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0022801 | A0006210 | AUI | RB | C0449475 | A1182637 | AUI | NULL | R00689643 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0034143 | A0006342 | AUI | RB | C0682702 | A1389183 | AUI | NULL | R00689644 | NULL | AOD | AOD | NULL | NULL | N | NULL | +C0221406 | A0009638 | AUI | RB | C0020635 | A1393940 | AUI | NULL | R00689645 | NULL | AOD | AOD | NULL | NULL | N | NULL | + +``` +mysql> select * from MRDOC limit 10; +``` +DOCKEY | VALUE | TYPE | EXPL | +--|--|--|-- +ATN | AAL_TERM | expanded_form | AAL term | +ATN | ACCEPTABILITYID | expanded_form | Acceptability Id | +ATN | ACCEPTED_THERAPEUTIC_USE_FOR | expanded_form | Accepted therapeutic use for | +ATN | ACTIVE | expanded_form | Active | +ATN | ADDED_MEANING | expanded_form | Additional descriptive information | +ATN | ADDITIONAL_GUIDELINE | expanded_form | Additional explanatory text that is applicable to a concept (code/heading/subheading). | +ATN | ADDON_CODE | expanded_form | A "T" in this field indicates that it is an "Add-on" code, i.e. 
it is commonly carried out in addition to the primary procedure performed | +ATN | AGR | expanded_form | Alliance of Genome Resources | +ATN | AMBIGUITY_FLAG | expanded_form | Source atom ambiguity flag | +ATN | AMT | expanded_form | AOT uses MeSH term | + + +``` +mysql> select * from MRSMAP limit 10; +``` +MAPSETCUI | MAPSETSAB | MAPID | MAPSID | FROMEXPR | FROMTYPE | REL | RELA | TOEXPR | TOTYPE | CVF | +--|--|--|--|--|--|--|--|--|--|-- +C1306694 | MTH | AT102971857 | NULL | C0264643 | CUI | SY | NULL | Hypertension, Renovascular AND Hypertension, Malignant | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971858 | NULL | C0276253 | CUI | SY | NULL | Pneumonia AND Cytomegalovirus Infections | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971859 | NULL | C0409780 | CUI | SY | NULL | Synovitis AND Hand | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971861 | NULL | C1706094 | CUI | SY | NULL | Adhesives AND Denture Retention | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971862 | NULL | C1706094 | CUI | SY | NULL | Dental Cements AND Orthodontics | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971863 | NULL | C0180739 | CUI | RN | NULL | Enteral Nutrition/instrumentation | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT102971864 | NULL | C1533661 | CUI | SY | NULL | Arthroscopy AND Wrist Joint | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT110677869 | NULL | C1962918 | CUI | RN | NULL | Wheelchairs AND Equipment and Supplies | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT110677871 | NULL | C1855348 | CUI | RU | NULL | Glomerulonephritis | BOOLEAN_EXPRESSION_STR | NULL | +C1306694 | MTH | AT110677872 | NULL | C1855348 | CUI | RU | NULL | Marfan Syndrome | BOOLEAN_EXPRESSION_STR | NULL | + +``` +mysql> select * from MRSTY limit 10; +``` +CUI | TUI | STN | STY | ATUI | CVF | +--|--|--|--|--|-- +C0541479 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863944 | NULL | +C0541480 | T104 | A1.4.1.2 | 
Chemical Viewed Structurally | AT07863945 | NULL | +C0541481 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863946 | NULL | +C0070474 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863947 | 256 | +C0541516 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863948 | NULL | +C0678461 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863949 | NULL | +C0678462 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863950 | 256 | +C0678518 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863951 | 256 | +C0678519 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863952 | 256 | +C0678520 | T104 | A1.4.1.2 | Chemical Viewed Structurally | AT07863953 | 256 | + +``` +mysql> select * from MRAUI limit 10; +``` +AUI1 | CUI1 | VER | REL | RELA | MAPREASON | AUI2 | CUI2 | MAPIN | +--|--|--|--|--|--|--|--|-- +A0000039 | C1411876 | 2022AA | NULL | NULL | move | A0000039 | C0869474 | Y | +A0000049 | C0003910 | 2005AB | NULL | NULL | move | A0000049 | C0236828 | Y | +A0000080 | C0003477 | 2011AA | NULL | NULL | move | A0000080 | C1527281 | Y | +A0000087 | C0596170 | 2008AB | NULL | NULL | move | A0000087 | C2267227 | Y | +A0000088 | C0596170 | 2008AB | NULL | NULL | move | A0000088 | C2267227 | Y | +A0000090 | C0596170 | 2008AB | NULL | NULL | move | A0000090 | C2267227 | Y | +A0000091 | C0596170 | 2008AB | NULL | NULL | move | A0000091 | C2267227 | Y | +A0000092 | C0596170 | 2008AB | NULL | NULL | move | A0000092 | C2267227 | Y | +A0000230 | C0029220 | 2007AA | NULL | NULL | move | A0000230 | C0236748 | Y | +A0000231 | C0029220 | 2007AA | NULL | NULL | move | A0000231 | C0236748 | Y | + +``` +mysql> select * from MRCONSO limit 10; +``` +CUI | LAT | TS | LUI | STT | SUI | ISPREF | AUI | SAUI | SCUI | SDUI | SAB | TTY | CODE | STR | SRL | SUPPRESS | CVF | +--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|-- +C0026106 | ENG | S | L0026106 | PF | S0000001 | N | A0000002 | NULL | NULL | NULL | ICD10 | HT | F70 | Mild mental retardation | 3 | N | 256 
| +C0026106 | ENG | S | L0026106 | PF | S0000001 | N | A0000003 | NULL | NULL | NULL | ICD10AM | HT | F70 | Mild mental retardation | 3 | N | 256 | +C0026351 | ENG | S | L0026351 | PF | S0000002 | N | A0000008 | NULL | NULL | NULL | ICD10 | HT | F71 | Moderate mental retardation | 3 | N | NULL | +C0026351 | ENG | S | L0026351 | PF | S0000002 | N | A0000009 | NULL | NULL | NULL | ICD10AM | HT | F71 | Moderate mental retardation | 3 | N | NULL | +C0036857 | ENG | S | L0036857 | PF | S0000003 | N | A0000014 | NULL | NULL | NULL | ICD10 | HT | F72 | Severe mental retardation | 3 | N | 256 | +C0036857 | ENG | S | L0036857 | PF | S0000003 | N | A0000015 | NULL | NULL | NULL | ICD10AM | HT | F72 | Severe mental retardation | 3 | N | 256 | +C0020796 | ENG | S | L0033296 | PF | S0000004 | N | A0000020 | NULL | NULL | NULL | ICD10 | HT | F73 | Profound mental retardation | 3 | N | 256 | +C0020796 | ENG | S | L0033296 | PF | S0000004 | N | A0000021 | NULL | NULL | NULL | ICD10AM | HT | F73 | Profound mental retardation | 3 | N | 256 | +C0025362 | ENG | S | L0080273 | PF | S0000005 | N | A0000026 | NULL | NULL | NULL | ICD10 | HT | F79 | Unspecified mental retardation | 3 | N | 256 | +C0025362 | ENG | S | L0080273 | PF | S0000005 | N | A0000027 | NULL | NULL | NULL | ICD10AM | HT | F79 | Unspecified mental retardation | 3 | N | 256 | + +``` +mysql> select * from MRFILES; +``` +FIL | DES | FMT | CLS | RWS | BTS | +--|--|--|--|--|-- +AMBIGLUI.RRF | Ambiguous term identifiers | LUI,CUI | 2 | 301093 | 5788399 | +AMBIGSUI.RRF | Ambiguous string identifiers | SUI,CUI | 2 | 207867 | 4022457 | +CHANGE/DELETEDCUI.RRF | Deleted concepts | PCUI,PSTR | 2 | 1426698 | 21400470 | +CHANGE/DELETEDLUI.RRF | Deleted terms | PLUI,PSTR | 2 | 0 | 0 | +CHANGE/DELETEDSUI.RRF | Deleted strings | PSUI,LAT,PSTR | 3 | 0 | 0 | +CHANGE/MERGEDCUI.RRF | Merged concepts | PCUI,CUI | 2 | 1536 | 29184 | +CHANGE/MERGEDLUI.RRF | Merged terms | PLUI,LUI | 2 | 0 | 0 | +MRAUI.RRF | AUI History | 
AUI1,CUI1,VER,REL,RELA,MAPREASON,AUI2,CUI2,MAPIN | 9 | 293552 | 15877630 | +MRCOLS.RRF | Attribute Relation | COL,DES,REF,MIN,AV,MAX,FIL,DTY | 8 | 339 | 23403 | +MRCONSO.RRF | Concept names and sources | CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF | 18 | 13501908 | 1737065435 | +MRCUI.RRF | CUI History | CUI1,VER,REL,RELA,MAPREASON,CUI2,MAPIN | 7 | 2716556 | 77130698 | +MRDEF.RRF | Definitions | CUI,AUI,ATUI,SATUI,SAB,DEF,SUPPRESS,CVF | 8 | 425261 | 118551841 | +MRDOC.RRF | Typed key value metadata map | DOCKEY,VALUE,TYPE,EXPL | 4 | 3396 | 218481 | +MRFILES.RRF | Relation Relation | FIL,DES,FMT,CLS,RWS,BTS | 6 | 52 | 4208 | +MRHIER.RRF | Computable hierarchies | CUI,AUI,CXN,PAUI,SAB,RELA,PTR,HCD,CVF | 9 | 31893483 | 4851178506 | +MRHIST.RRF | Source-asserted history | CUI,SOURCEUI,SAB,SVER,CHANGETYPE,CHANGEKEY,CHANGEVAL,REASON,CVF | 9 | 0 | 0 | +MRMAP.RRF | Mappings | MAPSETCUI,MAPSETSAB,MAPSUBSETID,MAPRANK,MAPID,MAPSID,FROMID,FROMSID,FROMEXPR,FROMTYPE,FROMRULE,FROMRES,REL,RELA,TOID,TOSID,TOEXPR,TOTYPE,TORULE,TORES,MAPRULE,MAPRES,MAPTYPE,MAPATN,MAPATV,CVF | 26 | 810346 | 129610753 | +MRRANK.RRF | Concept Name Ranking | RANK,SAB,TTY,SUPPRESS | 4 | 683 | 12217 | +MRREL.RRF | Related Concepts | CUI1,AUI1,STYPE1,REL,CUI2,AUI2,STYPE2,RELA,RUI,SRUI,SAB,SL,RG,DIR,SUPPRESS,CVF | 16 | 43842950 | 4093351915 | +MRSAB.RRF | Source Metadata | VCUI,RCUI,VSAB,RSAB,SON,SF,SVER,VSTART,VEND,IMETA,RMETA,SLC,SCC,SRL,TFR,CFR,CXTY,TTYL,ATNL,LAT,CENC,CURVER,SABIN,SSN,SCIT | 25 | 192 | 142036 | +MRSAT.RRF | Simple Concept, Term and String Attributes | CUI,LUI,SUI,METAUI,STYPE,CODE,ATUI,SATUI,ATN,SAB,ATV,SUPPRESS,CVF | 13 | 65915853 | 5967696352 | +MRSMAP.RRF | Simple Mappings | MAPSETCUI,MAPSETSAB,MAPID,MAPSID,FROMEXPR,FROMTYPE,REL,RELA,TOEXPR,TOTYPE,CVF | 11 | 416075 | 35648007 | +MRSTY.RRF | Semantic Types | CUI,TUI,STN,STY,ATUI,CVF | 6 | 3476668 | 199142173 | +MRXNS_ENG.RRF | Normalized String Index | LAT,NSTR,CUI,LUI,SUI | 5 | 12150129 | 
886221009 | +MRXNW_ENG.RRF | Normalized Word Index | LAT,NWD,CUI,LUI,SUI | 5 | 39785958 | 1617668497 | +MRXW_ARA.RRF | Arabic Word Index | LAT,WD,CUI,LUI,SUI | 5 | 290245 | 13322541 | +MRXW_BAQ.RRF | Basque Word Index | LAT,WD,CUI,LUI,SUI | 5 | 2669 | 107206 | +MRXW_CHI.RRF | Chinese Word Index | LAT,WD,CUI,LUI,SUI | 5 | 601700 | 27220291 | +MRXW_CZE.RRF | Czech Word Index | LAT,WD,CUI,LUI,SUI | 5 | 477599 | 20363847 | +MRXW_DAN.RRF | Danish Word Index | LAT,WD,CUI,LUI,SUI | 5 | 2466 | 97114 | +MRXW_DUT.RRF | Dutch Word Index | LAT,WD,CUI,LUI,SUI | 5 | 1101850 | 46227836 | +MRXW_ENG.RRF | English Word Index | LAT,WD,CUI,LUI,SUI | 5 | 39223696 | 1581848830 | +MRXW_EST.RRF | Estonian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 226586 | 8986331 | +MRXW_FIN.RRF | Finnish Word Index | LAT,WD,CUI,LUI,SUI | 5 | 42922 | 1875628 | +MRXW_FRE.RRF | French Word Index | LAT,WD,CUI,LUI,SUI | 5 | 2426179 | 101219317 | +MRXW_GER.RRF | German Word Index | LAT,WD,CUI,LUI,SUI | 5 | 799432 | 34054417 | +MRXW_GRE.RRF | Greek Word Index | LAT,WD,CUI,LUI,SUI | 5 | 274018 | 13634628 | +MRXW_HEB.RRF | Hebrew Word Index | LAT,WD,CUI,LUI,SUI | 5 | 1617 | 63262 | +MRXW_HUN.RRF | Hungarian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 241751 | 10526508 | +MRXW_ITA.RRF | Italian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 1199574 | 48609396 | +MRXW_JPN.RRF | Japanese Word Index | LAT,WD,CUI,LUI,SUI | 5 | 282000 | 16758359 | +MRXW_KOR.RRF | Korean Word Index | LAT,WD,CUI,LUI,SUI | 5 | 460600 | 19847488 | +MRXW_LAV.RRF | Latvian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 230914 | 10092516 | +MRXW_NOR.RRF | Norwegian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 125266 | 5530491 | +MRXW_POL.RRF | Polish Word Index | LAT,WD,CUI,LUI,SUI | 5 | 425767 | 18182640 | +MRXW_POR.RRF | Portuguese Word Index | LAT,WD,CUI,LUI,SUI | 5 | 1498232 | 60641784 | +MRXW_RUS.RRF | Russian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 1111315 | 52355773 | +MRXW_SCR.RRF | Croatian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 24050 | 1017136 | +MRXW_SPA.RRF | Spanish 
Word Index | LAT,WD,CUI,LUI,SUI | 5 | 9117253 | 375647723 | +MRXW_SWE.RRF | Swedish Word Index | LAT,WD,CUI,LUI,SUI | 5 | 288913 | 12571621 | +MRXW_TUR.RRF | Turkish Word Index | LAT,WD,CUI,LUI,SUI | 5 | 419840 | 17046315 | +MRXW_UKR.RRF | Ukrainian Word Index | LAT,WD,CUI,LUI,SUI | 5 | 25840 | 1210231 | + +``` +mysql> select * from MRSAB limit 10; +``` +VCUI | RCUI | VSAB | RSAB | SON | SF | SVER | VSTART | VEND | IMETA | RMETA | SLC | SCC | SRL | TFR | CFR | CXTY | TTYL | ATNL | LAT | CENC | CURVER | SABIN | SSN | SCIT | +--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|-- +C1140092 | C1140091 | AIR93 | AIR | AI/RHEUM, 1993 | AIR | 1993 | NULL | NULL | 1995AA | NULL | May Cheh;;Lister Hill National Center for Biomedical Communications, National Library of Medicine;Building 38A, Room 9E902;8600 Rockville Pike;Bethesda;MD;;20894;;;cheh@nlm.nih.gov; | May Cheh;;Lister Hill National Center for Biomedical Communications, National Library of Medicine;Building 38A, Room 9E902;8600 Rockville Pike;Bethesda;MD;;20894;;;cheh@nlm.nih.gov; | 0 | 685 | 630 | FULL-MULTIPLE | DI,FI,HT,SY | NULL | ENG | UTF-8 | Y | Y | AI/RHEUM | ;;;;AI/RHEUM;;;;;National Library of Medicine, Lister Hill Center;1993;;Bethesda, MD;;;;;; | +C2366569 | C1140170 | ALT2009 | ALT | Alternative Billing Concepts, 2009 | ALT | 2009 | NULL | NULL | 2009AA | NULL | ;;ABC Coding Solutions - Alternative Link;6121 Indian School Road NE;Suite 131;Albuquerque;NM;United States;87110;1-877-621-5465;1-505-875-0002;Legal@ABCcodes.com; | Bernd G. Lucks;Chief Operating Officer;ABC Coding Solutions - Alternative Link;6121 Indian School Road NE;Suite 131;Albuquerque;NM;United States;87110;1-505-875-0001 ext. 
202;;bernd.lucks@ABCcodes.com; | 3 | 4669 | 4613 | FULL | HT,PT | DATE_CREATED,DATE_LAST_MODIFIED,SOURCE_UI | ENG | UTF-8 | Y | Y | Alternative Billing Concepts | ;;;;ABC Codes and Terminology;;;9th;Albuquerque, NM;ABC Coding Solutions - Alternative Link;2009;;;;;;ENG;; | +C1140163 | C1140162 | AOD2000 | AOD | Alcohol and Other Drug Thesaurus, 2000 | AOD | 2000 | NULL | NULL | 2002AC | NULL | Nancy Winstanley;;NIAAA Library c/o CSR Incorporated;2107 Wilson Blvd., Suite 1000;;Arlington;VA;;22201;703-741-7147;; e-mail: nwinstanley@csrincorporated.com;; | Dagobert Soergel;;;;;;;;;301-405-2037;;ds52@umail.umd.edu; | 0 | 20685 | 15915 | FULL | DE,DS,ES,ET,EX,FN,NP,NS,NX,XD | HN,SOS | ENG | UTF-8 | Y | Y | Alcohol and Other Drug Thesaurus | ;;;;Alcohol and Other Drug Thesaurus: A Guide to Concepts and Terminology in Substance Abuse and Addiction;;;3rd. ed. [4 Volumes.];Bethesda, MD;National Institute on Alcohol Abuse and Alcoholism (NIAAA) and Center for Substance Abuse Prevention (CSAP);2000;;;;;;ENG;; | +C1704486 | C1704485 | AOT2003 | AOT | Authorized Osteopathic Thesaurus, 2003 | AOT | 2003 | NULL | NULL | 2006AD | NULL | ;;;;;Chevy Chase;MD;;;;;;http://www.aacom.org/InfoFor/educators/Pages/thesaurus.aspx | ;;American Association of Colleges of Osteopathic Medicine ;5550 Friendship Boulevard ;Suite 310;Chevy Chase;MD;United States;20815-7231;301-968-4100;301-968-4101;;http://www.aacom.org/InfoFor/educators/Pages/thesaurus.aspx | 0 | 471 | 276 | FULL-MULTIPLE | ET,PT | AMT | ENG | UTF-8 | Y | Y | Authorized Osteopathic Thesaurus | ;;;;Authorized Osteopathic Thesaurus;;;;Chevy Chase, MD;Educational Council of Osteopathic Principles of the American Association of Colleges of Osteopathic Medicine;2004;;;;;http://www.aacom.org/InfoFor/educators/Pages/thesaurus.aspx;ENG;; | +C5777091 | C4722517 | ATC_2022_23_03_06 | ATC | Anatomical Therapeutic Chemical Classification System, ATC_2022 | ATC | ATC_2022 | NULL | NULL | 2023AA | NULL | ;;WHO Collaborating Centre for Drug 
Statistics Methodology;Norwegian Institute of Public Health;P.O.Box 4404 Nydalen;Oslo;;Norway;0403;+47 21 07 81 60;+47 21 07 81 46;whocc@fhi.no;http://www.whocc.no/copyright_disclaimer/ | ;;WHO Collaborating Centre for Drug Statistics Methodology;Norwegian Institute of Public Health;P.O.Box 4404 Nydalen;Oslo;;Norway;0403;+47 21 07 81 60;+47 21 07 81 46;whocc@fhi.no;http://www.whocc.no/ | 0 | 7210 | 5794 | FULL | IN,PT,RXN_IN,RXN_PT | ATC_LEVEL,IS_DRUG_CLASS | ENG | UTF-8 | Y | Y | Anatomical Therapeutic Chemical Classification System | ;;WHO Collaborating Centre for Drug Statistics Methodology;;Anatomical Therapeutic Chemical (ATC) classification system;;;2022;Oslo, Norway;WHO Collaborating Centre for Drug Statistics Methodology;;;;;;http://www.whocc.no/copyright_disclaimer/;;; | +C1140164 | C1140165 | BI98 | BI | Beth Israel Vocabulary, 1.0 | BI | 1.0 | NULL | NULL | 1999AA | NULL | Daniel Z. Sands, M.D., M.P.H.;Clinical Systems Integration Architect;Center for Clinical Computing,Beth Israel Deaconess Medical Center,Harvard University;330 Brookline Avenue;;Boston;MA;United States;02215;617-667-1510;810-592-0716; e-mail: dsands@bidmc.Harvard.edu; | Howard Goldberg, MD.;;;;;;;;;;;hgoldber@bidmc.harvard.edu; | 2 | 1250 | 937 | NULL | AB,PT,RT,SY | NULL | ENG | UTF-8 | Y | Y | Beth Israel Problem List | Howard Goldberg, MD;;;;Beth Israel OMR Clinical Problem List Vocabulary;;;Version 1.0;Boston, MA;Beth Israel Deaconess Medical Center;1999;;;;;;ENG;; | +C4550264 | C3251798 | CCC2_5_2018 | CCC | Clinical Care Classification, 2_5_2018 | CCC | 2_5_2018 | NULL | NULL | 2018AA | NULL | Dr. Virginia K. Saba;CEO & President;SabaCare,Inc;;;Arlington;VA;United States;;703-521-6132;703-521-3866;vsaba@att.net;http://www.sabacare.com/; | Dr. Virginia K. 
Saba;CEO & President;SabaCare,Inc;;;Arlington;VA;United States;;703-521-6132;703-521-3866;vsaba@att.net;http://www.sabacare.com/; | 1 | 410 | 405 | FULL-MULTIPLE | HT,MP,MTH_HT,PT | NULL | ENG | UTF-8 | Y | Y | Clinical Care Classification | ;;SabaCare,Inc.;;Clinical Care Classification (CCC) System;;;2.5;;;;January 10, 2018;;;;;ENG;; | +C1140221 | C1140220 | CCPSS99 | CCPSS | Canonical Clinical Problem Statement System, 1999 | CCPSS | 1999 | NULL | NULL | 2000AA | NULL | Steven Brown, M.D.;Associate Professor, Biomedical Informatics;Eskind Biomedical Library, Vanderbilt University Medical Center;2209 Garland Ave;Room 442;Nashville;TN;United States;37232-8340;(615) 321-6335;;sbrown@vumclib.mc.vanderbilt.edu; | Steven Brown, MD;;Department of Biomedical Informatics Vanderbilt University;;;;;;;;;; | 3 | 15777 | 15245 | NULL | MP,PT,TX | CCF | ENG | UTF-8 | Y | Y | Clinical Problem Statements | ;;;;Canonical Clincial Problem Statement System;;;Version 1.0;;;June 23, 1999;;;;;;ENG;Contact: sbrown@vumclib.mc.vanderbilt.edu; | +C1541964 | C1140228 | CCS2005 | CCS | Clinical Classifications Software, 2005 | CCS | 2005 | NULL | NULL | 2005AC | NULL | Anne Elixhauser, Ph.D.;Senior Research Scientist;Agency for Healthcare Research and Quality;540 Gaither Road;;Rockville;MD;United States;20850;(301) 427-1411, 1-800-358-9295;(301) 594-1430;AElixhau@AHRQ.gov; | Anne Elixhauser, Ph.D.;Senior Research Scientist;Agency for Healthcare Research and Quality;540 Gaither Road;;Rockville;MD;United States;20850;1-800-358-9295;(301)-594-1430;AElixhau@AHRQ.gov; | 0 | 1617 | 1109 | FULL | HT,MD,MV,SD,SP,XM | CCI,FROMRSAB,FROMVSAB,MAPSETRSAB,MAPSETVERSION,MAPSETVSAB,MTH_MAPFROMCOMPLEXITY,MTH_MAPFROMEXHAUSTIVE,MTH_MAPSETCOMPLEXITY,MTH_MAPTOCOMPLEXITY,MTH_MAPTOEXHAUSTIVE,SOS,TORSAB,TOVSAB | ENG | UTF-8 | Y | Y | Clinical Classifications Software | ;;Agency for Healthcare Research and Quality (AHRQ);;Clinical Classifications Software (CCS);;;;;;April 2005;;Rockville,MD;;; 
http://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp;ENG;Phone: 301-594-1364.; | +C5770268 | C5400755 | CCSR_ICD10CM_2023 | CCSR_ICD10CM | Clinical Classifications Software Refined for ICD-10-CM, 2023 | CCS | 2023 | NULL | NULL | 2023AA | NULL | ;;Agency for Healthcare Research and Quality;5600 Fishers Lane;Mail Stop 07N94A;Rockville;MD;United States;20857;1-866-290-HCUP;(301) 594-1430;hcup@ahrq.gov;https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccs_refined.jsp; | ;;Agency for Healthcare Research and Quality;5600 Fishers Lane;Mail Stop 07N94A;Rockville;MD;United States;20857;1-866-290-HCUP;(301)-594-1430;hcup@ahrq.gov;https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccs_refined.jsp; | 0 | 546 | 545 | NULL | SD,XM | FROMRSAB,FROMVSAB,MAPSETRSAB,MAPSETVERSION,MAPSETVSAB,MTH_MAPFROMCOMPLEXITY,MTH_MAPFROMEXHAUSTIVE,MTH_MAPSETCOMPLEXITY,MTH_MAPTOCOMPLEXITY,MTH_MAPTOEXHAUSTIVE,TORSAB,TOVSAB | ENG | UTF-8 | Y | Y | Clinical Classifications Software Refined for ICD-10-CM | ;;Healthcare Cost and Utilization Project (HCUP);;Clinical Classifications Software Refined for ICD-10-CM;;;;;Agency for Healthcare Research and Quality (AHRQ);;November 2022;Rockville, MD;;;;ENG;; | From 13fd4e1ca8899dafad1eca7a2fabe333e42fd665 Mon Sep 17 00:00:00 2001 From: "E.C. 
Wood" Date: Wed, 12 Jul 2023 13:56:03 -0700 Subject: [PATCH 002/117] #316 found where IDs from other sources are coming from --- understanding_umls.md | 505 +++++++++++++----------------------------- 1 file changed, 159 insertions(+), 346 deletions(-) diff --git a/understanding_umls.md b/understanding_umls.md index 02baabd0..104b01ea 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -65,6 +65,16 @@ MRXW_SPA | MRXW_SWE | MRXW_TUR | +Tables that `umls2rdf.py` uses: +- MRSTY +- MRCONSO +- MRSAB +- MRREL +- MRDEF +- MRSAT +- MRRANK +- MRDOC + ``` mysql> select * from MRCUI limit 10; ``` @@ -81,352 +91,6 @@ C0000164 | 2003AB | RO | NULL | NULL | C0000163 | Y | C0000177 | 1993AA | SY | NULL | NULL | C0014924 | Y | C0000219 | 1993AA | DEL | NULL | NULL | NULL | NULL | -``` -mysql> select * from MRCOLS; -``` - -COL | DES | REF | MIN | AV | MAX | FIL | DTY | ---|--|--|--|--|--|--|-- -ATNL | Attribute name list for a source. | NULL | 0 | 69.84 | 1178 | MRSAB.RRF | varchar(4000) | -ATN | Attribute name | NULL | 2 | 10.38 | 62 | MRSAT.RRF | varchar(100) | -ATUI | Unique identifier for attribute. | NULL | 10 | 10.64 | 11 | MRSTY.RRF | varchar(11) | -ATUI | Unique identifier for attribute. | NULL | 10 | 10.85 | 11 | MRSAT.RRF | varchar(11) | -ATUI | Unique identifier for attribute. 
| NULL | 10 | 10.86 | 11 | MRDEF.RRF | varchar(11) | -ATV | Attribute value | NULL | 1 | 12.69 | 35985 | MRSAT.RRF | varchar(65000) | -AUI1 | Unique identifier for first atom | NULL | 0 | 8.52 | 9 | MRREL.RRF | varchar(9) | -AUI1 | Unique identifier for first atom | NULL | 8 | 8.54 | 9 | MRAUI.RRF | varchar(9) | -AUI2 | Unique identifier for second atom | NULL | 0 | 8.52 | 9 | MRREL.RRF | varchar(9) | -AUI2 | Unique identifier for second atom | NULL | 8 | 8.54 | 9 | MRAUI.RRF | varchar(9) | -AUI | Unique identifier for atom | NULL | 8 | 8.58 | 9 | MRHIER.RRF | varchar(9) | -AUI | Unique identifier for atom | NULL | 8 | 8.74 | 9 | MRDEF.RRF | varchar(9) | -AUI | Unique identifier for atom | NULL | 8 | 8.77 | 9 | MRCONSO.RRF | varchar(9) | -AV | Average Length, Characters | NULL | 4 | 4.12 | 6 | MRCOLS.RRF | numeric(5,2) | -BTS | Size in Bytes | NULL | 1 | 7.19 | 10 | MRFILES.RRF | integer | -CENC | Character encoding of a source as specified by IANA | NULL | 5 | 5.00 | 5 | MRSAB.RRF | varchar(20) | -CFR | CUI frequency for a source | NULL | 1 | 4.18 | 6 | MRSAB.RRF | integer | -CHANGEKEY | CONCEPTSTATUS (if history relates to a SNOMED CT concept) or DESCRIPTIONSTATUS (if history relates to a SNOMED CT atom or "description") | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | -CHANGETYPE | Source asserted code for type of change | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | -CHANGEVAL | SNOMED CT CONCEPTSTATUS or DESCRIPTIONSTATUS value after the change took place | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | -CLS | Number of columns | NULL | 1 | 1.12 | 2 | MRFILES.RRF | integer | -CODE | Unique Identifier or code for string in source | NULL | 0 | 4.46 | 56 | MRSAT.RRF | varchar(100) | -CODE | Unique Identifier or code for string in source | NULL | 1 | 7.50 | 95 | MRCONSO.RRF | varchar(100) | -COL | Column or data element name | NULL | 2 | 3.71 | 11 | MRCOLS.RRF | varchar(20) | -CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | 
MRAUI.RRF | char(8) | -CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | MRCUI.RRF | char(8) | -CUI1 | Unique identifier for first concept | NULL | 8 | 8.00 | 8 | MRREL.RRF | char(8) | -CUI2 | Unique identifier for second concept | NULL | 0 | 3.33 | 8 | MRCUI.RRF | char(8) | -CUI2 | Unique identifier for second concept | NULL | 8 | 8.00 | 8 | MRAUI.RRF | char(8) | -CUI2 | Unique identifier for second concept | NULL | 8 | 8.00 | 8 | MRREL.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 0 | 0.00 | 0 | MRHIST.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | AMBIGLUI.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | AMBIGSUI.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | CHANGE/MERGEDCUI.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRCONSO.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRDEF.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRHIER.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRSAT.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRSTY.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXNS_ENG.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXNW_ENG.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_ARA.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_CHI.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_CZE.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_DUT.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | 
MRXW_ENG.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_EST.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_FRE.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_GER.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_GRE.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_HUN.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_ITA.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_JPN.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_KOR.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_LAV.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_NOR.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_POL.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_POR.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_RUS.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SCR.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SPA.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_SWE.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_TUR.RRF | char(8) | -CUI | Unique identifier for concept | NULL | 8 | 8.00 | 8 | MRXW_UKR.RRF | char(8) | -CURVER | Current Version flag | NULL | 1 | 1.00 | 1 | MRSAB.RRF | char(1) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRDEF.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRHIER.RRF | varchar(50) | -CVF | Content view flag | 
NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRREL.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRSAT.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 0.00 | 0 | MRSMAP.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 1.22 | 5 | MRCONSO.RRF | varchar(50) | -CVF | Content view flag | NULL | 0 | 2.13 | 5 | MRSTY.RRF | varchar(50) | -CXN | The context number if the atom has multiple contexts | NULL | 1 | 2.17 | 5 | MRHIER.RRF | integer | -CXTY | Context type for a source | NULL | 0 | 5.14 | 13 | MRSAB.RRF | varchar(50) | -DEF | Definition | NULL | 1 | 232.23 | 10939 | MRDEF.RRF | varchar(16000) | -DES | Descriptive Name | NULL | 5 | 28.81 | 136 | MRCOLS.RRF | varchar(200) | -DES | Descriptive Name | NULL | 8 | 18.25 | 42 | MRFILES.RRF | varchar(200) | -DIR | Source asserted directionality flag | NULL | 0 | 0.13 | 1 | MRREL.RRF | varchar(1) | -DOCKEY | Key to be documented | NULL | 2 | 3.65 | 8 | MRDOC.RRF | varchar(50) | -DTY | SQL-92 data type for this column | NULL | 7 | 10.02 | 14 | MRCOLS.RRF | varchar(20) | -EXPL | Detailed explanation | NULL | 0 | 26.57 | 941 | MRDOC.RRF | varchar(1000) | -FIL | Physical FILENAME | NULL | 9 | 10.99 | 21 | MRCOLS.RRF | varchar(50) | -FIL | Physical FILENAME | NULL | 9 | 12.12 | 21 | MRFILES.RRF | varchar(50) | -FMT | Comma separated list of COL | NULL | 7 | 29.69 | 190 | MRFILES.RRF | varchar(300) | -FROMEXPR | The expression that a mapping is mapped from | NULL | 1 | 6.93 | 9 | MRSMAP.RRF | varchar(4000) | -FROMEXPR | The expression that a mapping is mapped from | NULL | 1 | 8.29 | 18 | MRMAP.RRF | varchar(4000) | -FROMID | Metathesaurus identifier for the entity being mapped from | NULL | 1 | 7.31 | 18 | MRMAP.RRF | varchar(50) | -FROMRES | Restriction applicable to the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | -FROMRULE | 
Machine processible rule applicable to the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | -FROMSID | Source asserted identifier for the entity being mapped from | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | -FROMTYPE | The type of expression that a mapping is mapped from | NULL | 3 | 3.98 | 4 | MRSMAP.RRF | varchar(50) | -FROMTYPE | The type of expression that a mapping is mapped from | NULL | 3 | 3.99 | 4 | MRMAP.RRF | varchar(50) | -HCD | Source asserted hierarchical number or code of context member (if it exists) | NULL | 0 | 0.48 | 51 | MRHIER.RRF | varchar(100) | -IMETA | Version of the Metathesaurus that a source was added | NULL | 6 | 6.00 | 6 | MRSAB.RRF | varchar(10) | -ISPREF | Indicates whether AUI is preferred | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | -LAT | Language of Term(s) | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | char(3) | -LAT | Language of Term(s) | NULL | 0 | 2.97 | 3 | MRSAB.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRCONSO.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXNS_ENG.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXNW_ENG.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ARA.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_BAQ.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_CHI.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_CZE.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_DAN.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_DUT.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ENG.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_EST.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_FIN.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_FRE.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 
| 3.00 | 3 | MRXW_GER.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_GRE.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_HEB.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_HUN.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_ITA.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_JPN.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_KOR.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_LAV.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_NOR.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_POL.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_POR.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_RUS.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SCR.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SPA.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_SWE.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_TUR.RRF | char(3) | -LAT | Language of Term(s) | NULL | 3 | 3.00 | 3 | MRXW_UKR.RRF | char(3) | -LUI | Unique identifier for term | NULL | 0 | 0.00 | 0 | CHANGE/MERGEDLUI.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 0 | 4.50 | 9 | MRSAT.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.17 | 9 | MRXW_SCR.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.20 | 9 | MRXW_JPN.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 
| 8.22 | 9 | AMBIGLUI.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.43 | 9 | MRXW_ENG.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRCONSO.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRXNS_ENG.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.44 | 9 | MRXNW_ENG.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.48 | 9 | MRXW_CZE.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.52 | 9 | MRXW_DUT.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.58 | 9 | MRXW_GER.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.67 | 9 | MRXW_SPA.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.77 | 9 | MRXW_POR.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.77 | 9 | MRXW_RUS.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.78 | 9 | MRXW_ITA.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.78 | 9 | MRXW_POL.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.80 | 9 | MRXW_FRE.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.82 | 9 | MRXW_SWE.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.90 | 9 | MRXW_KOR.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.98 | 9 | MRXW_NOR.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.99 | 9 | MRXW_HUN.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 8 | 8.99 | 9 | MRXW_LAV.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_ARA.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_CHI.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_EST.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_GRE.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 
9 | 9.00 | 9 | MRXW_TUR.RRF | varchar(10) | -LUI | Unique identifier for term | NULL | 9 | 9.00 | 9 | MRXW_UKR.RRF | varchar(10) | -MAPATN | Mapping attribute name (for future use) | NULL | 0 | 2.82 | 6 | MRMAP.RRF | varchar(20) | -MAPATV | Mapping attribute value (for future use) | NULL | 0 | 0.00 | 1 | MRMAP.RRF | varchar(4000) | -MAPID | Metathesaurus asserted identifier for mapping | NULL | 10 | 10.98 | 11 | MRSMAP.RRF | varchar(50) | -MAPID | Metathesaurus asserted identifier for mapping | NULL | 10 | 10.99 | 11 | MRMAP.RRF | varchar(50) | -MAPIN | Mapping in current subset | NULL | 0 | 0.42 | 1 | MRCUI.RRF | char(1) | -MAPIN | Mapping in current subset | NULL | 1 | 1.00 | 1 | MRAUI.RRF | char(1) | -MAPRANK | Order in which mappings in a subset should be applied | NULL | 0 | 0.49 | 2 | MRMAP.RRF | integer | -MAPREASON | Reason for mapping | NULL | 0 | 0.00 | 4 | MRCUI.RRF | varchar(4000) | -MAPREASON | Reason for mapping | NULL | 4 | 4.00 | 4 | MRAUI.RRF | varchar(4000) | -MAPRES | Restriction applicable to this mapping | NULL | 0 | 34.78 | 429 | MRMAP.RRF | varchar(4000) | -MAPRULE | Machine processible rule applicable to this mapping | NULL | 0 | 9.57 | 336 | MRMAP.RRF | varchar(4000) | -MAPSETCUI | CUI of the map set | NULL | 8 | 8.00 | 8 | MRMAP.RRF | char(8) | -MAPSETCUI | CUI of the map set | NULL | 8 | 8.00 | 8 | MRSMAP.RRF | char(8) | -MAPSETSAB | SAB of the map set | NULL | 3 | 10.60 | 13 | MRSMAP.RRF | varchar(40) | -MAPSETSAB | SAB of the map set | NULL | 3 | 10.71 | 13 | MRMAP.RRF | varchar(40) | -MAPSID | Source asserted identifier for mapping | NULL | 0 | 0.00 | 0 | MRSMAP.RRF | varchar(50) | -MAPSID | Source asserted identifier for mapping | NULL | 0 | 0.01 | 36 | MRMAP.RRF | varchar(50) | -MAPSUBSETID | Map subset identifier used to identify a subset of related mappings within a map set | NULL | 0 | 0.49 | 1 | MRMAP.RRF | varchar(10) | -MAPTYPE | Type of mapping | NULL | 0 | 4.26 | 9 | MRMAP.RRF | varchar(50) | -MAX | Maximum Length | NULL | 1 
| 1.37 | 5 | MRCOLS.RRF | integer | -METAUI | Metathesaurus asserted unique identifier | NULL | 0 | 7.85 | 10 | MRSAT.RRF | varchar(50) | -MIN | Minimum Length | NULL | 1 | 1.02 | 2 | MRCOLS.RRF | integer | -NSTR | Normalized string | NULL | 1 | 38.86 | 2460 | MRXNS_ENG.RRF | varchar(3000) | -NWD | Normalized word | NULL | 1 | 6.55 | 80 | MRXNW_ENG.RRF | varchar(100) | -PAUI | Unique identifier for parent atom | NULL | 0 | 8.46 | 9 | MRHIER.RRF | varchar(9) | -PCUI | Concept unique identifier in the previous Metathesaurus | NULL | 8 | 8.00 | 8 | CHANGE/DELETEDCUI.RRF | char(8) | -PCUI | Concept unique identifier in the previous Metathesaurus | NULL | 8 | 8.00 | 8 | CHANGE/MERGEDCUI.RRF | char(8) | -PLUI | Lexical unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDLUI.RRF | varchar(10) | -PLUI | Lexical unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/MERGEDLUI.RRF | varchar(10) | -PSTR | Preferred name in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDLUI.RRF | varchar(3000) | -PSTR | Preferred name in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | varchar(3000) | -PSTR | Preferred name in the previous Metathesaurus | NULL | 4 | 4.00 | 4 | CHANGE/DELETEDCUI.RRF | varchar(3000) | -PSUI | String unique identifier in the previous Metathesaurus | NULL | 0 | 0.00 | 0 | CHANGE/DELETEDSUI.RRF | varchar(10) | -PTR | Path to root | NULL | 0 | 103.81 | 345 | MRHIER.RRF | varchar(1000) | -RANK | Termgroup ranking | NULL | 4 | 4.00 | 4 | MRRANK.RRF | integer | -RCUI | Unique identifier for root SRC concept | NULL | 8 | 8.00 | 8 | MRSAB.RRF | char(8) | -REASON | Explanation of change, if present | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(1000) | -REF | Documentation Section Number | NULL | 0 | 0.00 | 0 | MRCOLS.RRF | varchar(20) | -RELA | Additional relationship label | NULL | 0 | 0.00 | 0 | MRAUI.RRF | varchar(100) | -RELA | Additional relationship label | NULL | 0 
| 0.00 | 0 | MRCUI.RRF | varchar(100) | -RELA | Additional relationship label | NULL | 0 | 10.69 | 54 | MRREL.RRF | varchar(100) | -RELA | Additional relationship label | NULL | 0 | 14.07 | 37 | MRMAP.RRF | varchar(100) | -RELA | Additional relationship label | NULL | 0 | 19.91 | 37 | MRSMAP.RRF | varchar(100) | -RELA | Additional relationship label | NULL | 0 | 2.71 | 12 | MRHIER.RRF | varchar(100) | -REL | Relationship label | NULL | 0 | 0.00 | 0 | MRAUI.RRF | varchar(4) | -REL | Relationship label | NULL | 2 | 2.00 | 2 | MRMAP.RRF | varchar(4) | -REL | Relationship label | NULL | 2 | 2.00 | 2 | MRSMAP.RRF | varchar(4) | -REL | Relationship label | NULL | 2 | 2.24 | 3 | MRREL.RRF | varchar(4) | -REL | Relationship label | NULL | 2 | 2.65 | 4 | MRCUI.RRF | varchar(4) | -RG | Relationship group | NULL | 0 | 0.06 | 2 | MRREL.RRF | varchar(10) | -RMETA | Version of the Metathesaurus where a version is removed | NULL | 0 | 0.09 | 6 | MRSAB.RRF | varchar(10) | -RSAB | Root source abbreviation | NULL | 2 | 5.94 | 15 | MRSAB.RRF | varchar(40) | -RUI | Unique identifier for relationship | NULL | 9 | 9.82 | 10 | MRREL.RRF | varchar(10) | -RWS | Number of rows | NULL | 1 | 5.56 | 8 | MRFILES.RRF | integer | -SABIN | Source in current subset | NULL | 1 | 1.00 | 1 | MRSAB.RRF | char(1) | -SAB | Source abbreviation | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 4.12 | 11 | MRDEF.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 5.31 | 15 | MRRANK.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 5.48 | 15 | MRREL.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 5.70 | 15 | MRCONSO.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 5.75 | 13 | MRSAT.RRF | varchar(40) | -SAB | Source abbreviation | NULL | 2 | 7.90 | 13 | MRHIER.RRF | varchar(40) | -SATUI | Source asserted attribute identifier | NULL | 0 | 0.47 | 16 | MRDEF.RRF | varchar(50) | -SATUI | Source asserted attribute identifier | NULL | 
0 | 3.24 | 36 | MRSAT.RRF | varchar(50) | -SAUI | Source asserted atom identifier | NULL | 0 | 1.73 | 18 | MRCONSO.RRF | varchar(100) | -SCC | Content contact info for a source | NULL | 0 | 152.05 | 332 | MRSAB.RRF | varchar(1000) | -SCIT | Source citation | NULL | 54 | 164.09 | 674 | MRSAB.RRF | varchar(4000) | -SCUI | Source asserted concept identifier | NULL | 0 | 5.28 | 95 | MRCONSO.RRF | varchar(100) | -SDUI | Source asserted descriptor identifier | NULL | 0 | 2.73 | 13 | MRCONSO.RRF | varchar(100) | -SF | Source Family | NULL | 2 | 4.20 | 13 | MRSAB.RRF | varchar(40) | -SLC | License contact info for a source | NULL | 12 | 167.35 | 346 | MRSAB.RRF | varchar(1000) | -SL | Source of relationship labels | NULL | 2 | 5.48 | 15 | MRREL.RRF | varchar(40) | -SON | Source Official Name | NULL | 10 | 48.65 | 145 | MRSAB.RRF | varchar(3000) | -SOURCEUI | Source asserted unique identifier | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(50) | -SRL | Source Restriction Level | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | integer | -SRL | Source Restriction Level | NULL | 1 | 1.00 | 1 | MRSAB.RRF | integer | -SRUI | Source attributed relationship identifier | NULL | 0 | 1.20 | 36 | MRREL.RRF | varchar(50) | -SSN | Source short name | NULL | 3 | 26.96 | 89 | MRSAB.RRF | varchar(3000) | -STN | Semantic type tree number | NULL | 1 | 7.85 | 14 | MRSTY.RRF | varchar(100) | -STR | String | NULL | 1 | 38.20 | 2930 | MRCONSO.RRF | varchar(3000) | -STT | String type | NULL | 2 | 2.01 | 3 | MRCONSO.RRF | varchar(3) | -STYPE1 | The name of the column in MRCONSO.RRF that contains the first identifier to which the relationship is attached | NULL | 3 | 3.62 | 4 | MRREL.RRF | varchar(50) | -STYPE2 | The name of the column in MRCONSO.RRF that contains the second identifier to which the relationship is attached | NULL | 3 | 3.62 | 4 | MRREL.RRF | varchar(50) | -STYPE | The name of the column in MRCONSO.RRF or MRREL.RRF that contains the identifier to which the attribute is attached | NULL | 3 | 3.25 | 
4 | MRSAT.RRF | varchar(50) | -STY | Semantic type | NULL | 4 | 17.65 | 39 | MRSTY.RRF | varchar(50) | -SUI | Unique identifier for string | NULL | 0 | 4.57 | 9 | MRSAT.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_BAQ.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_DAN.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_FIN.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.00 | 8 | MRXW_HEB.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.35 | 9 | AMBIGSUI.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.35 | 9 | MRXW_JPN.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.53 | 9 | MRXW_DUT.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.58 | 9 | MRCONSO.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.61 | 9 | MRXW_GER.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.64 | 9 | MRXNS_ENG.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.67 | 9 | MRXNW_ENG.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.67 | 9 | MRXW_ENG.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.71 | 9 | MRXW_SPA.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.79 | 9 | MRXW_POR.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.79 | 9 | MRXW_RUS.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.82 | 9 | MRXW_ITA.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.84 | 9 | MRXW_SWE.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.85 | 9 | MRXW_CZE.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.85 | 9 | MRXW_FRE.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 8 | 8.98 | 9 | MRXW_NOR.RRF | varchar(10) | -SUI | Unique identifier for 
string | NULL | 8 | 8.99 | 9 | MRXW_HUN.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_ARA.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_CHI.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_EST.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_GRE.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_KOR.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_LAV.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_POL.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_SCR.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_TUR.RRF | varchar(10) | -SUI | Unique identifier for string | NULL | 9 | 9.00 | 9 | MRXW_UKR.RRF | varchar(10) | -SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | -SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRDEF.RRF | char(1) | -SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRRANK.RRF | char(1) | -SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRREL.RRF | char(1) | -SUPPRESS | Suppressible flag | NULL | 1 | 1.00 | 1 | MRSAT.RRF | char(1) | -SVER | Release date or version number of a source | NULL | 0 | 0.00 | 0 | MRHIST.RRF | varchar(20) | -SVER | Release date or version number of a source | NULL | 0 | 5.08 | 15 | MRSAB.RRF | varchar(20) | -TFR | Term frequency for a source | NULL | 1 | 4.41 | 7 | MRSAB.RRF | integer | -TOEXPR | The expression that a mapping is mapped to | NULL | 0 | 6.03 | 242 | MRMAP.RRF | varchar(4000) | -TOEXPR | The expression that a mapping is mapped to | NULL | 1 | 6.92 | 242 | MRSMAP.RRF | varchar(4000) | -TOID | Metathesaurus identifier for the entity being mapped to | NULL | 0 | 5.18 | 18 | MRMAP.RRF | varchar(50) | -TORES | Restriction applicable to the entity being 
mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | -TORULE | Machine processible rule applicable to the entity being mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(4000) | -TOSID | Source asserted identifier for the entity being mapped to | NULL | 0 | 0.00 | 0 | MRMAP.RRF | varchar(50) | -TOTYPE | The type of expression that a mapping is mapped to | NULL | 0 | 3.98 | 23 | MRMAP.RRF | varchar(50) | -TOTYPE | The type of expression that a mapping is mapped to | NULL | 4 | 4.36 | 22 | MRSMAP.RRF | varchar(50) | -TS | Term status | NULL | 1 | 1.00 | 1 | MRCONSO.RRF | char(1) | -TTYL | Term type list for a source | NULL | 0 | 11.76 | 86 | MRSAB.RRF | varchar(400) | -TTY | Term type in source | NULL | 2 | 2.35 | 11 | MRCONSO.RRF | varchar(20) | -TTY | Term type in source | NULL | 2 | 2.58 | 11 | MRRANK.RRF | varchar(20) | -TUI | Unique identifier of Semantic type | NULL | 4 | 4.00 | 4 | MRSTY.RRF | char(4) | -TYPE | Type of information | NULL | 3 | 13.14 | 21 | MRDOC.RRF | varchar(50) | -VALUE | Value | NULL | 0 | 15.98 | 62 | MRDOC.RRF | varchar(200) | -VCUI | Unique identifier for versioned SRC concept | NULL | 0 | 7.71 | 8 | MRSAB.RRF | char(8) | -VEND | Valid end date for a source | NULL | 0 | 0.00 | 0 | MRSAB.RRF | char(8) | -VER | Last release version in which CUI1 was valid | NULL | 6 | 6.00 | 6 | MRAUI.RRF | varchar(10) | -VER | Last release version in which CUI1 was valid | NULL | 6 | 6.00 | 6 | MRCUI.RRF | varchar(10) | -VSAB | Versioned source abbreviation | NULL | 3 | 11.35 | 24 | MRSAB.RRF | varchar(40) | -VSTART | Valid start date for a source | NULL | 0 | 0.00 | 0 | MRSAB.RRF | char(8) | -WD | Word in lower-case | NULL | 1 | 10.53 | 54 | MRXW_FIN.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 2.90 | 38 | MRXW_KOR.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 3.65 | 68 | MRXW_CHI.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 4.58 | 35 | MRXW_EST.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 
| 5.23 | 37 | MRXW_TUR.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 5.47 | 22 | MRXW_ARA.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 5.71 | 38 | MRXW_POR.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 5.91 | 38 | MRXW_ITA.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.12 | 19 | MRXW_HEB.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.13 | 24 | MRXW_UKR.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.23 | 80 | MRXW_ENG.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.38 | 25 | MRXW_DAN.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.67 | 46 | MRXW_SPA.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 6.83 | 39 | MRXW_FRE.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.14 | 40 | MRXW_RUS.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.17 | 18 | MRXW_BAQ.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.50 | 34 | MRXW_GRE.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.55 | 48 | MRXW_POL.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.57 | 52 | MRXW_CZE.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.89 | 51 | MRXW_DUT.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.97 | 27 | MRXW_HUN.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 7.98 | 29 | MRXW_LAV.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 8.02 | 37 | MRXW_SCR.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 8.37 | 41 | MRXW_GER.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 8.61 | 39 | MRXW_SWE.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 8.91 | 85 | MRXW_JPN.RRF | varchar(500) | -WD | Word in lower-case | NULL | 1 | 9.11 | 44 | MRXW_NOR.RRF | varchar(500) | - ``` mysql> select * from MRREL limit 10; ``` @@ -597,3 +261,152 @@ C4550264 | C3251798 | CCC2_5_2018 | CCC | Clinical Care Classification, 2_5_2018 C1140221 | C1140220 | CCPSS99 | CCPSS | 
Canonical Clinical Problem Statement System, 1999 | CCPSS | 1999 | NULL | NULL | 2000AA | NULL | Steven Brown, M.D.;Associate Professor, Biomedical Informatics;Eskind Biomedical Library, Vanderbilt University Medical Center;2209 Garland Ave;Room 442;Nashville;TN;United States;37232-8340;(615) 321-6335;;sbrown@vumclib.mc.vanderbilt.edu; | Steven Brown, MD;;Department of Biomedical Informatics Vanderbilt University;;;;;;;;;; | 3 | 15777 | 15245 | NULL | MP,PT,TX | CCF | ENG | UTF-8 | Y | Y | Clinical Problem Statements | ;;;;Canonical Clincial Problem Statement System;;;Version 1.0;;;June 23, 1999;;;;;;ENG;Contact: sbrown@vumclib.mc.vanderbilt.edu; | C1541964 | C1140228 | CCS2005 | CCS | Clinical Classifications Software, 2005 | CCS | 2005 | NULL | NULL | 2005AC | NULL | Anne Elixhauser, Ph.D.;Senior Research Scientist;Agency for Healthcare Research and Quality;540 Gaither Road;;Rockville;MD;United States;20850;(301) 427-1411, 1-800-358-9295;(301) 594-1430;AElixhau@AHRQ.gov; | Anne Elixhauser, Ph.D.;Senior Research Scientist;Agency for Healthcare Research and Quality;540 Gaither Road;;Rockville;MD;United States;20850;1-800-358-9295;(301)-594-1430;AElixhau@AHRQ.gov; | 0 | 1617 | 1109 | FULL | HT,MD,MV,SD,SP,XM | CCI,FROMRSAB,FROMVSAB,MAPSETRSAB,MAPSETVERSION,MAPSETVSAB,MTH_MAPFROMCOMPLEXITY,MTH_MAPFROMEXHAUSTIVE,MTH_MAPSETCOMPLEXITY,MTH_MAPTOCOMPLEXITY,MTH_MAPTOEXHAUSTIVE,SOS,TORSAB,TOVSAB | ENG | UTF-8 | Y | Y | Clinical Classifications Software | ;;Agency for Healthcare Research and Quality (AHRQ);;Clinical Classifications Software (CCS);;;;;;April 2005;;Rockville,MD;;; http://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp;ENG;Phone: 301-594-1364.; | C5770268 | C5400755 | CCSR_ICD10CM_2023 | CCSR_ICD10CM | Clinical Classifications Software Refined for ICD-10-CM, 2023 | CCS | 2023 | NULL | NULL | 2023AA | NULL | ;;Agency for Healthcare Research and Quality;5600 Fishers Lane;Mail Stop 07N94A;Rockville;MD;United States;20857;1-866-290-HCUP;(301) 
594-1430;hcup@ahrq.gov;https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccs_refined.jsp; | ;;Agency for Healthcare Research and Quality;5600 Fishers Lane;Mail Stop 07N94A;Rockville;MD;United States;20857;1-866-290-HCUP;(301)-594-1430;hcup@ahrq.gov;https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccs_refined.jsp; | 0 | 546 | 545 | NULL | SD,XM | FROMRSAB,FROMVSAB,MAPSETRSAB,MAPSETVERSION,MAPSETVSAB,MTH_MAPFROMCOMPLEXITY,MTH_MAPFROMEXHAUSTIVE,MTH_MAPSETCOMPLEXITY,MTH_MAPTOCOMPLEXITY,MTH_MAPTOEXHAUSTIVE,TORSAB,TOVSAB | ENG | UTF-8 | Y | Y | Clinical Classifications Software Refined for ICD-10-CM | ;;Healthcare Cost and Utilization Project (HCUP);;Clinical Classifications Software Refined for ICD-10-CM;;;;;Agency for Healthcare Research and Quality (AHRQ);;November 2022;Rockville, MD;;;;ENG;; | + + + +``` +mysql> select * from MRDEF limit 10; +``` +CUI | AUI | ATUI | SATUI | SAB | DEF | SUPPRESS | CVF | +--|--|--|--|--|--|--|-- +C0007662 | A15587413 | AT100258389 | NULL | MSH | Areas set apart as burial grounds. | N | NULL | +C0031705 | A0101053 | AT100258390 | NULL | MSH | A non-metal element that has the atomic symbol P, atomic number 15, and atomic weight 31. It is an essential element that takes part in a broad variety of biochemical reactions. | N | NULL | +C0319858 | A15585286 | AT100258391 | NULL | MSH | A genus of ectomycorrhizae basidiomycetous fungi in the family Cortinariaceae. Some species are poisonous. | N | NULL | +C0026655 | A0088287 | AT100258392 | NULL | MSH | A republic in southern Africa, south of TANZANIA, east of ZAMBIA and ZIMBABWE, bordered on the west by the Indian Ocean. Its capital is Maputo. It was formerly called Portuguese East Africa. | N | NULL | +C2350764 | A26632051 | AT100258393 | NULL | MSH | The flow of ions into or out of cells that cause EXCITATORY POSTSYNAPTIC POTENTIALS. 
| N | NULL | +C2350395 | A15587282 | AT100258394 | NULL | MSH | Timing the acquisition of imaging data to specific points in the cardiac cycle to minimize image blurring and other motion artifacts. | N | NULL | +C2350340 | A26678303 | AT100258395 | NULL | MSH | The ion flow that effects the POSTSYNAPTIC POTENTIAL. | N | NULL | +C0073209 | A12983302 | AT100258396 | NULL | MSH | A PROTEIN-SERINE-THREONINE KINASE that is found in PHOTORECEPTOR CELLS. It mediates light-dependent PHOSPHORYLATION of RHODOPSIN and plays an important role in PHOTOTRANSDUCTION. | N | NULL | +C0872279 | A15585197 | AT100258397 | NULL | MSH | A type of strength-building exercise program that requires the body muscle to exert a force against some form of resistance, such as weight, stretch bands, water, or immovable objects. Resistance exercise is a combination of static and dynamic contractions involving shortening and lengthening of skeletal muscles. | N | NULL | +C2350288 | A26632695 | AT100258398 | NULL | MSH | The duration of time from initiation to discontinuation of drug therapy. | N | NULL | + +``` +mysql> select * from MRDEF where SAB != "MSH" limit 10; +``` +CUI | AUI | ATUI | SATUI | SAB | DEF | SUPPRESS | CVF | +--|--|--|--|--|--|--|-- +C1965760 | A15884584 | AT104406511 | NULL | ALT | Mapping the practitioner type or specialty to a non-specified emergency or non-emergency transportation, travel, or delivery expense or service code. Use associated HCPCS II codes to bill for expense(s) or service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366573 | A15884545 | AT104406512 | NULL | ALT | Mapping the practitioner type or specialty to a non-specified physician service or procedure code. Use associated HCPCS II codes to bill for physician service(s). This code is used for scope-of-practice mapping, not for billing. 
| N | NULL | +C2366625 | A15884463 | AT104406513 | NULL | ALT | Mapping the practitioner type or specialty to a nutritional therapy service code. Use associated HCPCS II codes to bill for nutrition service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366594 | A15884632 | AT104406514 | NULL | ALT | Mapping the practitioner type or specialty to a wound care an/or therapy service code. Use associated HCPCS II codes to bill for wound care service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366609 | A15884637 | AT104406515 | NULL | ALT | Mapping the practitioner type or specialty to a stabilizing, traction and/or restraining device or equipment code. Use associated HCPCS II codes to bill for stabilizing, traction or restraining device(s) or equipment. This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C1535681 | A15884507 | AT104406516 | NULL | ALT | Mapping the practitioner type or specialty to a non-specified gastroenterology procedure. Use CPT® and/or HCPCS II codes to bill for all gastroenterology service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366582 | A15884518 | AT104406517 | NULL | ALT | Mapping the practitioner type or specialty to a dental service adjunctive general code. Use associated HCPCS II codes to bill for dental service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366655 | A15884685 | AT104406518 | NULL | ALT | Mapping the practitioner type or specialty to a prescription documentation service code. Use associated HCPCS II codes to bill for documentation service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C2366632 | A15884500 | AT104406519 | NULL | ALT | Mapping the practitioner type or specialty to a vision rehab service code. Use associated HCPCS II codes to bill for vision service(s). 
This code is used for scope-of-practice mapping, not for billing. | N | NULL | +C1535683 | A15884654 | AT104406520 | NULL | ALT | Mapping the practitioner type or specialty to a non-specified diagnostic infusion procedure. Use CPT® and/or HCPCS II codes to bill for all infusion service(s). This code is used for scope-of-practice mapping, not for billing. | N | NULL | + +``` +mysql> select * from MRDEF where SAB != "MSH" and SAB != "ALT" limit 10; +``` +CUI | AUI | ATUI | SATUI | SAB | DEF | SUPPRESS | CVF | +--|--|--|--|--|--|--|-- +C0032226 | A18556325 | AT130670828 | NULL | CHV | disease causing increase of the fluid amount in the chest wall cavity | N | NULL | +C0032226 | A18593399 | AT130670829 | NULL | CHV | disease causing increase of the fluid amount in the chest wall cavity | N | NULL | +C0032226 | A18649215 | AT130670830 | NULL | CHV | disease causing increase of the fluid amount in the chest wall cavity | N | NULL | +C0078049 | A18558170 | AT130670831 | NULL | CHV | a substance used to prevent chickenpox | N | NULL | +C0078049 | A18576590 | AT130670832 | NULL | CHV | a substance used to prevent chickenpox | N | NULL | +C0078049 | A18632385 | AT130670833 | NULL | CHV | a substance used to prevent chickenpox | N | NULL | +C0078049 | A18688022 | AT130670834 | NULL | CHV | a substance used to prevent chickenpox | N | NULL | +C0543431 | A18565798 | AT130670835 | NULL | CHV | a unit of radiation dose | N | NULL | +C0556645 | A18566010 | AT130670836 | NULL | CHV | a unit of radiation dose | N | NULL | +C0560132 | A18566104 | AT130670837 | NULL | CHV | a unit of radiation dose | N | NULL | + +``` +mysql> select SAB, count(*) from MRDEF group by SAB; +``` +SAB | count(*) | +--|-- +AIR | 160 | +ALT | 4281 | +AOT | 240 | +CCC | 408 | +CHV | 2657 | +CSP | 8265 | +FMA | 2147 | +GO | 43648 | +HL7V3.0 | 8270 | +HPO | 14040 | +ICF | 767 | +ICF-CY | 906 | +JABL | 724 | +LNC | 511 | +MCM | 18 | +MDR | 230 | +MDRARA | 230 | +MDRBPO | 230 | +MDRCZE | 230 | +MDRDUT | 230 | 
+MDRFRE | 230 | +MDRGER | 230 | +MDRGRE | 230 | +MDRHUN | 230 | +MDRITA | 230 | +MDRJPN | 230 | +MDRKOR | 230 | +MDRLAV | 230 | +MDRPOL | 230 | +MDRPOR | 230 | +MDRRUS | 230 | +MDRSPA | 230 | +MDRSWE | 230 | +MEDLINEPLUS | 1023 | +MSH | 32702 | +MSHCZE | 22345 | +MSHFRE | 138 | +MSHNOR | 7460 | +MSHPOR | 30811 | +MSHSCR | 1 | +MSHSPA | 30647 | +MSHSWE | 17142 | +NANDA-I | 304 | +NCI | 137609 | +NEU | 2660 | +NIC | 602 | +NOC | 581 | +NUCCHCPT | 589 | +OMS | 134 | +ORPHANET | 6669 | +PDQ | 6356 | +PNDS | 265 | +PSY | 2212 | +SCTSPA | 7511 | +SNOMEDCT_US | 9413 | +SPN | 4204 | +UMD | 12259 | +UWDA | 442 | + +``` +mysql> select * from MRSAT limit 10; +``` +CUI | LUI | SUI | METAUI | STYPE | CODE | ATUI | SATUI | ATN | SAB | ATV | SUPPRESS | CVF | +--|--|--|--|--|--|--|--|--|--|--|--|-- +C0002797 | NULL | NULL | NULL | CUI | NULL | AT00000003 | NULL | DA | MTH | 19900930 | N | NULL | +C0002804 | NULL | NULL | NULL | CUI | NULL | AT00000004 | NULL | DA | MTH | 19900930 | N | NULL | +C0197800 | NULL | NULL | NULL | CUI | NULL | AT00000007 | NULL | DA | MTH | 19940412 | N | NULL | +C0002808 | NULL | NULL | NULL | CUI | NULL | AT00000008 | NULL | DA | MTH | 19900930 | N | NULL | +C0002810 | NULL | NULL | NULL | CUI | NULL | AT00000009 | NULL | DA | MTH | 19900930 | N | NULL | +C0002811 | NULL | NULL | NULL | CUI | NULL | AT00000010 | NULL | DA | MTH | 19900930 | N | NULL | +C0197801 | NULL | NULL | NULL | CUI | NULL | AT00000011 | NULL | DA | MTH | 19940412 | N | NULL | +C0002812 | NULL | NULL | NULL | CUI | NULL | AT00000012 | NULL | DA | MTH | 19900930 | N | NULL | +C0002813 | NULL | NULL | NULL | CUI | NULL | AT00000013 | NULL | DA | MTH | 19900930 | N | NULL | +C0197803 | NULL | NULL | NULL | CUI | NULL | AT00000014 | NULL | DA | MTH | 19940412 | N | NULL | + +``` +mysql> select * from MRSAT where SAB != "MTH" limit 10; +``` +CUI | LUI | SUI | METAUI | STYPE | CODE | ATUI | SATUI | ATN | SAB | ATV | SUPPRESS | CVF | +--|--|--|--|--|--|--|--|--|--|--|--|-- +C0226631 | 
L7947353 | S9261161 | A15487314 | AUI | 77500 | AT100000001 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0226476 | L1658590 | S1869222 | A15487357 | AUI | 43921 | AT100000002 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0226476 | L1658578 | S1869210 | A15487358 | AUI | 43921 | AT100000003 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C1184758 | L7921465 | S9257177 | A15487423 | AUI | 75484 | AT100000004 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0224224 | L7917062 | S9255685 | A15487425 | AUI | 46777 | AT100000005 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C1306642 | L7941514 | S9244381 | A15487435 | AUI | 71875 | AT100000006 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0227302 | L7921706 | S9259748 | A15487449 | AUI | 14929 | AT100000007 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0694589 | L1456954 | S1742895 | A15487461 | AUI | 67962 | AT100000008 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0152374 | L1457021 | S1742970 | A15487464 | AUI | 72455 | AT100000009 | NULL | LANGUAGE | FMA | Latin | N | NULL | +C0224086 | L7915107 | S9234531 | A15487481 | AUI | 9719 | AT100000010 | NULL | LANGUAGE | FMA | Latin | N | NULL | + +As you can see from the image below, the `CODE` column of the table corresponds to the FMA ID for that node. +![image](https://github.com/RTXteam/RTX-KG2/assets/36611732/c3a043fc-6e29-47c9-9598-f5b67dbec917) From ed24438693bd0f78a675acdcb1e72bfa5adea79e Mon Sep 17 00:00:00 2001 From: "E.C. 
Wood" Date: Wed, 12 Jul 2023 14:09:56 -0700 Subject: [PATCH 003/117] #316 added more table info --- understanding_umls.md | 125 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/understanding_umls.md b/understanding_umls.md index 104b01ea..a421fd25 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -408,5 +408,128 @@ C0694589 | L1456954 | S1742895 | A15487461 | AUI | 67962 | AT100000008 | NULL | C0152374 | L1457021 | S1742970 | A15487464 | AUI | 72455 | AT100000009 | NULL | LANGUAGE | FMA | Latin | N | NULL | C0224086 | L7915107 | S9234531 | A15487481 | AUI | 9719 | AT100000010 | NULL | LANGUAGE | FMA | Latin | N | NULL | -As you can see from the image below, the `CODE` column of the table corresponds to the FMA ID for that node. +As you can see from the image below (from `KG2.8.3pre`), the `CODE` column of the table corresponds to the FMA ID for that node. ![image](https://github.com/RTXteam/RTX-KG2/assets/36611732/c3a043fc-6e29-47c9-9598-f5b67dbec917) + +``` +mysql> select SAB, count(*) from MRSAT group by SAB; +``` +SAB | count(*) | +--|-- +ALT | 13272 | +AOD | 6054 | +AOT | 27 | +ATC | 7860 | +CCPSS | 15716 | +CCS | 23453 | +CCSR_ICD10CM | 12 | +CCSR_ICD10PCS | 12 | +CDT | 1275 | +CHV | 877774 | +CPT | 249691 | +CSP | 23251 | +CVX | 2755 | +DRUGBANK | 10459 | +FMA | 284369 | +GO | 168004 | +GS | 76415 | +HCDT | 7983 | +HCPCS | 66036 | +HCPT | 105273 | +HGNC | 810883 | +HL7V2.5 | 16770 | +HL7V3.0 | 38386 | +HPO | 29796 | +ICD10AM | 61299 | +ICD10CM | 101898 | +ICD10PCS | 79341 | +ICD9CM | 10190 | +ICF | 13822 | +ICF-CY | 386 | +ICNP | 1955 | +ICPC | 1318 | +ICPC2EENG | 1175 | +ICPC2ICD10ENG | 81849 | +ICPC2P | 29636 | +JABL | 490 | +KCD5 | 76 | +LCH_NW | 13 | +LNC | 2417573 | +MDR | 1045184 | +MDRARA | 1045184 | +MDRBPO | 1045184 | +MDRCZE | 1045184 | +MDRDUT | 1045184 | +MDRFRE | 1045184 | +MDRGER | 1045184 | +MDRGRE | 1045184 | +MDRHUN | 1045184 | +MDRITA | 1045184 | +MDRJPN | 780993 | 
+MDRKOR | 1045184 | +MDRLAV | 1045184 | +MDRPOL | 1045184 | +MDRPOR | 1045184 | +MDRRUS | 1045184 | +MDRSPA | 1045184 | +MDRSWE | 1045184 | +MED-RT | 95999 | +MEDCIN | 1355208 | +MEDLINEPLUS | 8173 | +MMSL | 242812 | +MMX | 412325 | +MSH | 4841113 | +MSHCZE | 81443 | +MSHFRE | 17 | +MSHITA | 59531 | +MSHLAV | 1191 | +MSHNOR | 62205 | +MSHPOR | 107509 | +MSHSCR | 8069 | +MSHSPA | 95836 | +MTH | 9493363 | +MTHMST | 1908 | +MTHSPL | 3345744 | +MVX | 411 | +NANDA-I | 1879 | +NCBI | 2034978 | +NCI | 158863 | +NDDF | 71219 | +NEU | 8194 | +NIC | 3023 | +NOC | 15731 | +NUCCHCPT | 522 | +OMIM | 204484 | +OMS | 21 | +PDQ | 55017 | +PNDS | 59 | +PPAC | 813 | +PSY | 8563 | +RCD | 175408 | +RXNORM | 2126399 | +SCTSPA | 5843064 | +SNMI | 85848 | +SNOMEDCT_US | 9779615 | +SNOMEDCT_VET | 457028 | +SPN | 19052 | +UMD | 46357 | +USP | 8802 | +USPMG | 1609 | +UWDA | 61526 | +VANDF | 349254 | + +``` +mysql> select * from MRRANK limit 10; +``` +MRRANK_RANK | SAB | TTY | SUPPRESS | +--|--|--|-- +266 | AIR | DI | N | +267 | AIR | FI | N | +264 | AIR | HT | N | +265 | AIR | SY | N | +364 | ALT | HT | N | +365 | ALT | PT | N | +282 | AOD | DE | N | +281 | AOD | DS | N | +277 | AOD | ES | N | +278 | AOD | ET | N | From a8fd56ce21b99fbbc412e403a0dbe7118f1ca821 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 6 Aug 2023 04:27:54 -0700 Subject: [PATCH 004/117] #316 add more research to file --- understanding_umls.md | 112 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/understanding_umls.md b/understanding_umls.md index a421fd25..bbc124ca 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -533,3 +533,115 @@ MRRANK_RANK | SAB | TTY | SUPPRESS | 281 | AOD | DS | N | 277 | AOD | ES | N | 278 | AOD | ET | N | + +# Studying `umls2rdf.py` + +Tables Used: +- `MRSTY` +- `MRCONSO` +- `MRSAB` +- `MRREL` +- `MRDEF` +- `MRSAT` +- `MRRANK` +- `MRDOC` + +## `MRSTY` + +**What is taken?** + +This table is accessed twice, once on line 143 and once on 
line 573. At the line 143 accession, the distinct columns `TUI`, `STN`, and `STY` are taken. At the line 573 accession, all of the columns from `MRSTY` are taken, which consists of `CUI`, `TUI`, `STN`, `STY`, `ATUI`, `CVF`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRCONSO` + +**What is taken?** + +This table is accessed once, on line 491. All of the columns are taken, which consists of `CUI`, `LAT`, `TS`, `LUI`, `STT`, `SUI`, `ISPREF`, `AUI`, `SAUI`, `SCUI`, `SDUI`, `SAB`, `TTY`, `CODE`, `STR`, `SRL`, `SUPPRESS`, and `CVF`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRSAB` + +**What is taken?** + +This table is accessed once, on line 496. All of the columns are taken, which consists of `VCUI`, `RCUI`, `VSAB`, `RSAB`, `SON`, `SF`, `SVER`, `VSTART`, `VEND`, `IMETA`, `RMETA`, `SLC`, `SCC`, `SRL`, `TFR`, `CFR`, `CXTY`, `TTYL`, `ATNL`, `LAT`, `CENC`, `CURVER`, `SABIN`, `SSN`, and `SCIT`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +A limit of 1 is placed on this `scan` (per ontology code). + +## `MRREL` + +**What is taken?** + +This table is accessed once, on line 527. All of the columns are taken, which consists of `CUI1`, `AUI1`, `STYPE1`, `REL`, `CUI2`, `AUI2`, `STYPE2`, `RELA`, `RUI`, `SRUI`, `SAB`, `SL`, `RG`, `DIR`, `SUPPRESS`, and `CVF`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRDEF` + +**What is taken?** + +This table is accessed once, on line 538. All of the columns are taken, which consists of `CUI`, `AUI`, `ATUI`, `SATUI`, `SAB`, `DEF`, `SUPPRESS`, and `CVF`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRSAT` + +**What is taken?** + +This table is accessed once, on line 549. All of the columns are taken, which consists of `CUI`, `LUI`, `SUI`, `METAUI`, `STYPE`, `CODE`, `ATUI`, `SATUI`, `ATN`, `SAB`, `ATV`, `SUPPRESS`, and `CVF`. 
+ +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRRANK` + +**What is taken?** + +This table is accessed once, on line 560. All of the columns are taken, which consists of `MRRANK_RANK`, `SAB`, `TTY`, and `SUPPRESS`. + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## `MRDOC` + +**What is taken?** + +This table is accessed once, on line 742. All of the columns are taken, which consists of `DOCKEY`, `VALUE`, `TYPE`, `EXPL` + +**What does this table contain?** + +**How does `umls2rdf.py` use this table?** + +## Assorted Notes + +- The table is filtered based on which ontology's ttl file is being generated at the time. This is done through the scan function, which is the actual function that sends the query to MySQL. Thus, this does not create redundancy but instead in fact ensures that only the ontologies we care about are ever queried. This is done on lines 222 through 227, where the `filt` parameter is passed into the `WHERE` clause on the MySQL statement. + +## To Do + +1. Determine which columns are actually making their way into the `TTL` files by examining the `TTL` files. + +2. Decide on join points/concatenation between these tables. Ideally, we will be able to implement a streaming solution like with SemMedDB, where each row has everything we need to know about that CUI. With extra information (such as `MRDOC` content), we may have to create a supplementary file, but it should be pretty small. + +3. Implement MySQL querying as decided on in step 2. + +4. Run time tests on the solution decided on in step 3. We need to determine whether this will save the time currently used in roughly 14 hours (though a good chunk of that is load in) of ETL currently present. We probably want under 2-3 hours of MySQL time to make this a worthwhile change. + +5. Repeat steps 3 and 4 until timing is desirable. + +6. Evaluate whether the content is sufficiently comparable to what is currently in KG2.
\ No newline at end of file From fa0882f0a591c5b61cdea713bc835c8a81951d36 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 15:05:58 -0700 Subject: [PATCH 005/117] #316 add links to make finding info easier --- understanding_umls.md | 92 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/understanding_umls.md b/understanding_umls.md index bbc124ca..5099b504 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -552,6 +552,10 @@ Tables Used: This table is accessed twice, once on line 143 and once on line 573. At the line 143 accession, the distinct columns `TUI`, `STN`, and `STY` are taken. At the line 573 accession, all of the columns from `MRSTY` are taken, which consists of `CUI`, `TUI`, `STN`, `STY`, `ATUI`, `CVF`. +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.Tf/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -562,6 +566,10 @@ This table is accessed twice, once on line 143 and once on line 573. At the line This table is accessed once, on line 491. All of the columns are taken, which consists of `CUI`, `LAT`, `TS`, `LUI`, `STT`, `SUI`, `ISPREF`, `AUI`, `SAUI`, `SCUI`, `SDUI`, `SAB`, `TTY`, `CODE`, `STR`, `SRL`, `SUPPRESS`, and `CVF`. +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -572,6 +580,10 @@ This table is accessed once, on line 491. All of the columns are taken, which co This table is accessed once, on line 496. All of the columns are taken, which consists of `VCUI`, `RCUI`, `VSAB`, `RSAB`, `SON`, `SF`, `SVER`, `VSTART`, `VEND`, `IMETA`, `RMETA`, `SLC`, `SCC`, `SRL`, `TFR`, `CFR`, `CXTY`, `TTYL`, `ATNL`, `LAT`, `CENC`, `CURVER`, `SABIN`, `SSN`, and `SCIT`. 
+**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.source_information_file_mrsab_rrf/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -584,6 +596,10 @@ A limit of 1 is placed on this `scan` (per ontology code). This table is accessed once, on line 527. All of the columns are taken, which consists of `CUI1`, `AUI1`, `STYPE1`, `REL`, `CUI2`, `AUI2`, `STYPE2`, `RELA`, `RUI`, `SRUI`, `SAB`, `SL`, `RG`, `DIR`, `SUPPRESS`, and `CVF`. +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.related_concepts_file_mrrel_rrf/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -594,6 +610,10 @@ This table is accessed once, on line 527. All of the columns are taken, which co This table is accessed once, on line 538. All of the columns are taken, which consists of `CUI`, `AUI`, `ATUI`, `SATUI`, `SAB`, `DEF`, `SUPPRESS`, and `CVF`. +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.definitions_file_mrdef_rrf/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -604,6 +624,10 @@ This table is accessed once, on line 538. All of the columns are taken, which co This table is accessed once, on line 549. All of the columns are taken, which consists of `CUI`, `LUI`, `SUI`, `METAUI`, `STYPE`, `CODE`, `ATUI`, `SATUI`, `ATN`, `SAB`, `ATV`, `SUPPRESS`, and `CVF`. +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.simple_concept_and_atom_attribute/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -614,6 +638,10 @@ This table is accessed once, on line 549. All of the columns are taken, which co This table is accessed once, on line 560. All of the columns are taken, which consists of `MRRANK_RANK`, `SAB`, `TTY`, and `SUPPRESS`. 
+**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_name_ranking_file_mrrank/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -624,6 +652,10 @@ This table is accessed once, on line 560. All of the columns are taken, which co This table is accessed once, on line 742. All of the columns are taken, which consists of `DOCKEY`, `VALUE`, `TYPE`, `EXPL` +**What do these columns mean?** + +See [here](https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.Te/). + **What does this table contain?** **How does `umls2rdf.py` use this table?** @@ -644,4 +676,62 @@ This table is accessed once, on line 742. All of the columns are taken, which co 5. Repeat steps 3 and 4 until timing is desirable. -6. Evaluate whether the content is sufficiently comparable to what is currently in KG2. \ No newline at end of file +6. Evaluate whether the content is sufficiently comparable to what is currently in KG2. + +### Step 1 + +Example: `umls-atc.ttl` + +``` + a owl:Class ; + skos:prefLabel """chlorothiazide, combinations"""@en ; + skos:notation """C03AH01"""^^xsd:string ; + rdfs:subClassOf ; + """5"""^^xsd:string ; + UMLS:has_cui """C3652440"""^^xsd:string ; + UMLS:has_tui """T109"""^^xsd:string ; + UMLS:has_tui """T121"""^^xsd:string ; + UMLS:has_sty ; + UMLS:has_sty ; +``` + +Example: `umls-chv.ttl` +``` + a owl:Class ; + skos:prefLabel """synthesis"""@en ; + skos:notation """0000050974"""^^xsd:string ; + skos:definition """the combining of separate elements or substances to form a coherent whole"""@en ; + """0.413096903"""^^xsd:string ; + """0.413096903"""^^xsd:string ; + """0.4381"""^^xsd:string ; + """0.4034"""^^xsd:string ; + """no"""^^xsd:string ; + """0.397790709"""^^xsd:string ; + UMLS:has_cui """C0220781"""^^xsd:string ; + UMLS:has_tui """T038"""^^xsd:string ; + UMLS:has_sty ; +``` + +Example: `umls-drugbank.ttl` +``` + a owl:Class ; + skos:prefLabel """Tetracaine"""@en ; + skos:notation 
"""DB09085"""^^xsd:string ; + skos:altLabel """2-(Dimethylamino)ethyl p-(butylamino)benzoate"""@en , """2-(dimethylamino)ethyl 4-(butylamino)benzoate"""@en , """Amethocaine"""@en , """Amethocaine HCl"""@en , """Dicaine"""@en , """Diäthylaminoäthanol ester der p-butylaminobenzösäure"""@en , """Medihaler-Tetracaine"""@en , """Metraspray"""@en , """Tetracaine HCl"""@en , """Tetracaína"""@en , """Tétracaïne"""@en , """p-(butylamino)benzoic acid β-(dimethylamino)ethyl ester"""@en , """p-Butylaminobenzoyl-2-dimethylaminoethanol"""@en ; + """0619F35CGV"""^^xsd:string ; + UMLS:has_cui """C0039629"""^^xsd:string ; + UMLS:has_cui """C0304456"""^^xsd:string ; + UMLS:has_cui """C0702211"""^^xsd:string ; + UMLS:has_cui """C4292382"""^^xsd:string ; + UMLS:has_cui """C4292391"""^^xsd:string ; + UMLS:has_tui """T109"""^^xsd:string ; + UMLS:has_tui """T121"""^^xsd:string ; + UMLS:has_sty ; + UMLS:has_sty ; +``` + +I am currently trying to find where `FDA_UNII_CODE` is in the data. I know that it is an attribute per https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html. + +It looks like, per running `select * from MRSAT where SAB="DRUGBANK" limit 20;`, the name of the attribute is in the `ATN` column and the value is in the `ATV` column. + +This link discusses each of the `MRREL` types: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html. \ No newline at end of file From d212e9470716fc753f69230ecc8179ff72f27922 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 15:08:27 -0700 Subject: [PATCH 006/117] #316 add the mysql to md script directly into the file --- understanding_umls.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/understanding_umls.md b/understanding_umls.md index 5099b504..3b7cf8cf 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -6,6 +6,14 @@ 3. Replace: `--\+$` With: `--` 4. 
Replace: `( )+` With: ` ` +Script used: +``` +sed -i -E "s/\+(-)+/\|--/g" umls_table.txt +sed -i -E "s/^\|( )*//g" umls_table.txt +sed -i -E "s/--\+$/--/g" umls_table.txt +sed -i -E "s/( )+/ /g" umls_table.txt +sed -i -E "s/<|>//g" umls_table.txt +``` # Tables ``` From 465559044bdce0d160f5c9e0ad25c8f790e36b30 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 15:55:57 -0700 Subject: [PATCH 007/117] #316 first attempt at a join --- understanding_umls.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/understanding_umls.md b/understanding_umls.md index 3b7cf8cf..2de1d850 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -742,4 +742,24 @@ I am currently trying to find where `FDA_UNII_CODE` is in the data. I know that It looks like, per running `select * from MRSAT where SAB="DRUGBANK" limit 20;`, the name of the attribute is in the `ATN` column and the value is in the `ATV` column. -This link discusses each of the `MRREL` types: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html. \ No newline at end of file +This link discusses each of the `MRREL` types: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html. 
+ +``` +select * from MRCONSO con left join MRSAT sat on con.CODE=sat.CODE where con.SAB="DRUGBANK" limit 10; +``` + +CUI | LAT | TS | LUI | STT | SUI | ISPREF | AUI | SAUI | SCUI | SDUI | SAB | TTY | CODE | STR | SRL | SUPPRESS | CVF | CUI | LUI | SUI | METAUI | STYPE | CODE | ATUI | SATUI | ATN | SAB | ATV | SUPPRESS | CVF | +--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|-- +C0039601 | ENG | S | L13409149 | VO | S16395464 | Y | A27406646 | NULL | DB00624 | NULL | DRUGBANK | FSY | DB00624 | Testosteronum | 0 | N | 256 | C0039601 | L0039601 | S0092451 | A27059293 | SCUI | DB00624 | AT215745781 | NULL | SID | DRUGBANK | DB05275 | N | NULL | +C0039925 | ENG | S | L13409165 | PF | S16395565 | Y | A27406649 | NULL | DB00599 | NULL | DRUGBANK | FSY | DB00599 | Tiopentale | 0 | N | 256 | C0039925 | L0039925 | S0093293 | A27062921 | SCUI | DB00599 | AT215745786 | NULL | FDA_UNII_CODE | DRUGBANK | JI8Z5M7NA3 | N | NULL | +C0004057 | ENG | S | L13415345 | PF | S16396444 | Y | A27406659 | NULL | DB00945 | NULL | DRUGBANK | FSY | DB00945 | ácido acetilsalicílico | 0 | N | 256 | C0004057 | L0001063 | S0584084 | A27066872 | SCUI | DB00945 | AT215746200 | NULL | SID | DRUGBANK | EXPT00475 | N | NULL | +C0004057 | ENG | S | L13415345 | PF | S16396444 | Y | A27406659 | NULL | DB00945 | NULL | DRUGBANK | FSY | DB00945 | ácido acetilsalicílico | 0 | N | 256 | C0004057 | L0001063 | S0584084 | A27066872 | SCUI | DB00945 | AT215745697 | NULL | FDA_UNII_CODE | DRUGBANK | R16CO5Y76E | N | NULL | +C0006491 | ENG | S | L13413033 | PF | S16390917 | Y | A27406692 | NULL | DB00611 | NULL | DRUGBANK | FSY | DB00611 | Butorphanolum | 0 | N | 256 | C0006491 | L0006491 | S0021116 | A27064721 | SCUI | DB00611 | AT215745783 | NULL | FDA_UNII_CODE | DRUGBANK | QV897JC36D | N | NULL | +C0007735 | ENG | S | L13409541 | PF | S16391091 | Y | A27406763 | NULL | DB00456 | NULL | DRUGBANK | FSY | DB00456 | Cefalotina | 0 | N | 256 | C0007735 | L0007540 | S0023182 
| A27055419 | SCUI | DB00456 | AT215745827 | NULL | SID | DRUGBANK | EXPT00946 | N | NULL | +C0061323 | ENG | P | L0061323 | VO | S16392549 | Y | A27406770 | NULL | DB00222 | NULL | DRUGBANK | FSY | DB00222 | Glimépiride | 0 | N | 256 | C0061323 | L0061323 | S1325002 | A27055170 | SCUI | DB00222 | AT215745888 | NULL | SID | DRUGBANK | APRD00381 | N | NULL | +C0064113 | ENG | S | L13414126 | PF | S16392995 | Y | A27406772 | NULL | DB01167 | NULL | DRUGBANK | FSY | DB01167 | Itraconazol | 0 | N | 256 | C0064113 | L0064113 | S0170262 | A27068928 | SCUI | DB01167 | AT215746145 | NULL | SID | DRUGBANK | APRD00040 | N | NULL | +C0064113 | ENG | S | L13414126 | PF | S16392995 | Y | A27406772 | NULL | DB01167 | NULL | DRUGBANK | FSY | DB01167 | Itraconazol | 0 | N | 256 | C0064113 | L0064113 | S0170262 | A27068928 | SCUI | DB01167 | AT215746144 | NULL | FDA_UNII_CODE | DRUGBANK | 304NUG5GF4 | N | NULL | +C0010927 | ENG | S | L13413100 | PF | S16391572 | Y | A27406779 | NULL | DB00851 | NULL | DRUGBANK | FSY | DB00851 | Dacarbazin | 0 | N | 256 | C0010927 | L0010927 | S0030020 | A27063198 | SCUI | DB00851 | AT215746222 | NULL | SID | DRUGBANK | APRD00331 | N | NULL | + + +For some reason, some (all?) of the names are in latin. 
\ No newline at end of file From 4e30007b0ae833a425e0cc32e3cd5f4bf4db93bb Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 16:30:00 -0700 Subject: [PATCH 008/117] #316 improving the query --- understanding_umls.md | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/understanding_umls.md b/understanding_umls.md index 2de1d850..8515255c 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -762,4 +762,42 @@ C0064113 | ENG | S | L13414126 | PF | S16392995 | Y | A27406772 | NULL | DB01167 C0010927 | ENG | S | L13413100 | PF | S16391572 | Y | A27406779 | NULL | DB00851 | NULL | DRUGBANK | FSY | DB00851 | Dacarbazin | 0 | N | 256 | C0010927 | L0010927 | S0030020 | A27063198 | SCUI | DB00851 | AT215746222 | NULL | SID | DRUGBANK | APRD00331 | N | NULL | -For some reason, some (all?) of the names are in latin. \ No newline at end of file +For some reason, some (all?) of the names are in latin. + +``` +select con.CUI, con.CODE, con.ISPREF, con.STR, sat.ATN, sat.ATV from MRCONSO con left join MRSAT sat on con.CODE=sat.CODE where con.SAB="DRUGBANK" limit 10; +``` +CUI | CODE | ISPREF | STR | ATN | ATV | +--|--|--|--|--|-- +C1948374 | DB08906 | Y | Fluticasonum furoas | FDA_UNII_CODE | JS86977WNV | +C2930696 | DB08895 | Y | Tofacitinibum | FDA_UNII_CODE | 87LA6FU830 | +C1948374 | DB08906 | Y | Furoato de fluticasona | FDA_UNII_CODE | JS86977WNV | +C1948374 | DB08906 | Y | Furoate de fluticasone | FDA_UNII_CODE | JS86977WNV | +C0042665 | DB09185 | Y | Viloxazina | FDA_UNII_CODE | 5I5Y2789ZF | +C0123163 | DB09081 | Y | idébénone | FDA_UNII_CODE | HB6PN45W4J | +C0042665 | DB09185 | Y | Viloxazinum | FDA_UNII_CODE | 5I5Y2789ZF | +C0068700 | DB09220 | Y | Nicorandilum | FDA_UNII_CODE | 260456HAM0 | +C0037659 | DB09099 | Y | Somatostatine | FDA_UNII_CODE | 6E20216Q0L | +C0037659 | DB09099 | Y | Somatostatinum | FDA_UNII_CODE | 6E20216Q0L | + +``` +select con.CODE, GROUP_CONCAT(DISTINCT con.CUI), 
GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t'), GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') from MRCONSO con left join MRSAT sat on con.CODE=sat.CODE where con.SAB="DRUGBANK" GROUP BY con.CODE limit 10; +``` + +**NEED TO INCREASE MAX GROUP_CONCAT LENGTH FIRST** + +**Had to use `\|` to display as a table** + + +CODE | GROUP_CONCAT(DISTINCT con.CUI) | GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '\|', con.STR) SEPARATOR '\t') | GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '\|', sat.ATV) SEPARATOR '\t') | +--|--|--|-- +DB00001 | C0378366,C0772394 | N\|Desulfatohirudin N\|Lepirudin Y\|Hirudin variant-1 Y\|Lepirudin recombinant Y\|R-hirudin Y\|[Leu1, Thr2]-63-desulfohirudin | FDA_UNII_CODE\|Y43GF64R34 RXAUI\|12740240 RXAUI\|12740241 RXAUI\|12740242 RXAUI\|8321260 RXAUI\|8471541 RXAUI\|8599806 RXCUI\|114934 RXCUI\|237057 SID\|BIOD00024 SID\|BTD00024 | +DB00002 | C0995188 | N\|Cetuximab Y\|Cétuximab Y\|Cetuximabum | FDA_UNII_CODE\|PQX0D8J21J RXAUI\|8473993 RXAUI\|8692140 RXAUI\|8692141 RXCUI\|318341 SID\|BIOD00071 SID\|BTD00071 | +DB00003 | C1135662 | N\|Dornase alfa Y\|Deoxyribonuclease (human clone 18-1 protein moiety) Y\|Dornasa alfa Y\|Dornase alfa, recombinant Y\|Dornase alpha Y\|Recombinant deoxyribonuclease (DNAse) | FDA_UNII_CODE\|953A26OA1Y RXAUI\|10778765 RXAUI\|8278645 RXAUI\|8326085 RXAUI\|8339777 RXAUI\|8376403 RXAUI\|8686775 RXCUI\|337623 SID\|BIOD00001 SID\|BTD00001 | +DB00004 | C0717670,C1383469 | N\|Denileukin diftitox Y\|Denileukin Y\|Interleukin-2/diptheria toxin fusion protein | FDA_UNII_CODE\|25E79B5CTM RXAUI\|10333971 RXAUI\|10333972 RXAUI\|8331268 RXCUI\|214470 RXCUI\|451876 SID\|BIOD00084 SID\|BTD00084 | +DB00005 | C0717758,C4291381,C4542001,C5135562 | N\|Etanercept N\|etanercept-szzs N\|etanercept-ykro Y\|Recombinant human TNF Y\|rhu TNFR:Fc Y\|rhu-TNFR:Fc Y\|TNFR-Immunoadhesin | FDA_UNII_CODE\|OP401G7OJC RXAUI\|11350310 RXAUI\|11350311 RXAUI\|11350312 RXAUI\|11350313 RXAUI\|11350314 RXAUI\|8622888 
RXAUI\|9712732 RXCUI\|1995554 RXCUI\|2103480 RXCUI\|214555 RXCUI\|2462511 SID\|BIOD00052 SID\|BTD00052 | +DB00006 | C0168273 | N\|Bivalirudin Y\|Bivalirudina Y\|Bivalirudinum | FDA_UNII_CODE\|TN9BEX005G RXAUI\|8657293 RXAUI\|8715166 RXAUI\|8715167 RXCUI\|60819 SID\|BIOD00076 SID\|BTD00076 SID\|DB02351 SID\|EXPT03302 | +DB00007 | C0085272 | N\|Leuprolide N\|Leuprorelin Y\|Leuprorelina Y\|Leuproreline Y\|Leuprorelinum | FDA_UNII_CODE\|EFY6W0M8TG RXAUI\|10785183 RXAUI\|10785184 RXAUI\|10785185 RXAUI\|8540224 RXAUI\|8646100 RXCUI\|42375 SID\|BIOD00009 SID\|BTD00009 | +DB00008 | C0391001 | N\|Peginterferon alfa-2a Y\|PEG-IFN alfa-2A Y\|PEG-Interferon alfa-2A Y\|Pegylated Interfeaon alfa-2A Y\|Pegylated interferon alfa-2a Y\|Pegylated interferon alpha-2a Y\|Pegylated-interferon alfa 2a | FDA_UNII_CODE\|Q46947FE7K RXAUI\|11350315 RXAUI\|11350316 RXAUI\|11350317 RXAUI\|11350318 RXAUI\|11350319 RXAUI\|8672645 RXAUI\|8731057 RXCUI\|120608 SID\|BIOD00043 SID\|BTD00043 | +DB00009 | C0032143 | N\|Alteplase N\|Tissue plasminogen activator Y\|Alteplasa Y\|Alteplase (genetical recombination) Y\|Alteplase, recombinant Y\|Alteplase,recombinant Y\|Plasminogen activator (human tissue-type protein moiety) Y\|rt-PA Y\|t-PA Y\|t-plasminogen activator Y\|Tissue plasminogen activator alteplase Y\|Tissue plasminogen activator, recombinant Y\|tPA | FDA_UNII_CODE\|1RXS4UE564 RXAUI\|10778766 RXAUI\|8368173 RXAUI\|8383242 RXAUI\|8543112 RXAUI\|8578376 RXAUI\|9193634 RXAUI\|9193635 RXAUI\|9193636 RXAUI\|9193637 RXAUI\|9193638 RXAUI\|9193639 RXAUI\|9193640 RXAUI\|9193641 RXCUI\|8410 SID\|BIOD00050 SID\|BTD00050 | +DB00010 | C0142046 | N\|Sermorelin | FDA_UNII_CODE\|89243S03TE RXAUI\|8619290 RXCUI\|56188 SID\|BIOD00033 SID\|BTD00033 | From 7b76d7cc6e3f1487816f46090c66c87502adc831 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 16:37:25 -0700 Subject: [PATCH 009/117] #316 more ttl snippets for context --- understanding_umls.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 
37 insertions(+) diff --git a/understanding_umls.md b/understanding_umls.md index 8515255c..2fa8a089 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -801,3 +801,40 @@ DB00007 | C0085272 | N\|Leuprolide N\|Leuprorelin Y\|Leuprorelina Y\|Leuprorelin DB00008 | C0391001 | N\|Peginterferon alfa-2a Y\|PEG-IFN alfa-2A Y\|PEG-Interferon alfa-2A Y\|Pegylated Interfeaon alfa-2A Y\|Pegylated interferon alfa-2a Y\|Pegylated interferon alpha-2a Y\|Pegylated-interferon alfa 2a | FDA_UNII_CODE\|Q46947FE7K RXAUI\|11350315 RXAUI\|11350316 RXAUI\|11350317 RXAUI\|11350318 RXAUI\|11350319 RXAUI\|8672645 RXAUI\|8731057 RXCUI\|120608 SID\|BIOD00043 SID\|BTD00043 | DB00009 | C0032143 | N\|Alteplase N\|Tissue plasminogen activator Y\|Alteplasa Y\|Alteplase (genetical recombination) Y\|Alteplase, recombinant Y\|Alteplase,recombinant Y\|Plasminogen activator (human tissue-type protein moiety) Y\|rt-PA Y\|t-PA Y\|t-plasminogen activator Y\|Tissue plasminogen activator alteplase Y\|Tissue plasminogen activator, recombinant Y\|tPA | FDA_UNII_CODE\|1RXS4UE564 RXAUI\|10778766 RXAUI\|8368173 RXAUI\|8383242 RXAUI\|8543112 RXAUI\|8578376 RXAUI\|9193634 RXAUI\|9193635 RXAUI\|9193636 RXAUI\|9193637 RXAUI\|9193638 RXAUI\|9193639 RXAUI\|9193640 RXAUI\|9193641 RXCUI\|8410 SID\|BIOD00050 SID\|BTD00050 | DB00010 | C0142046 | N\|Sermorelin | FDA_UNII_CODE\|89243S03TE RXAUI\|8619290 RXCUI\|56188 SID\|BIOD00033 SID\|BTD00033 | + +Here is that first element in the `TTL` file: +``` + a owl:Class ; + skos:prefLabel """Lepirudin"""@en ; + skos:notation """DB00001"""^^xsd:string ; + skos:altLabel """Desulfatohirudin"""@en , """Hirudin variant-1"""@en , """Lepirudin recombinant"""@en , """R-hirudin"""@en , """[Leu1, Thr2]-63-desulfohirudin"""@en ; + """BIOD00024"""^^xsd:string ; + """Y43GF64R34"""^^xsd:string ; + """BTD00024"""^^xsd:string ; + UMLS:has_cui """C0378366"""^^xsd:string ; + UMLS:has_cui """C0772394"""^^xsd:string ; + UMLS:has_tui """T116"""^^xsd:string ; + UMLS:has_tui 
"""T121"""^^xsd:string ; + UMLS:has_sty ; + UMLS:has_sty ; +``` + +Here is `DB00009` in the `TTL` file: +``` + a owl:Class ; + skos:prefLabel """Alteplase"""@en ; + skos:notation """DB00009"""^^xsd:string ; + skos:altLabel """Alteplasa"""@en , """Alteplase (genetical recombination)"""@en , """Alteplase, recombinant"""@en , """Alteplase,recombinant"""@en , """Plasminogen activator (human tissue-type protein moiety)"""@en , """Tissue plasminogen activator"""@en , """Tissue plasminogen activator alteplase"""@en , """Tissue plasminogen activator, recombinant"""@en , """rt-PA"""@en , """t-PA"""@en , """t-plasminogen activator"""@en , """tPA"""@en ; + """1RXS4UE564"""^^xsd:string ; + """BIOD00050"""^^xsd:string ; + """BTD00050"""^^xsd:string ; + UMLS:has_cui """C0032143"""^^xsd:string ; + UMLS:has_tui """T116"""^^xsd:string ; + UMLS:has_tui """T121"""^^xsd:string ; + UMLS:has_tui """T126"""^^xsd:string ; + UMLS:has_sty ; + UMLS:has_sty ; + UMLS:has_sty ; +``` + +I need to look more into how to tell which name is the correct name. \ No newline at end of file From a819ef9c7ca78103ab6960e814f80168c9faf420 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 7 Aug 2023 20:09:27 -0700 Subject: [PATCH 010/117] #316 UMLS source predicates --- understanding_umls.md | 123 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/understanding_umls.md b/understanding_umls.md index 2fa8a089..ce7019d2 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -837,4 +837,125 @@ Here is `DB00009` in the `TTL` file: UMLS:has_sty ; ``` -I need to look more into how to tell which name is the correct name. \ No newline at end of file +I need to look more into how to tell which name is the correct name. 
+ +UMLS Source Predicates: +``` +[ + { + "e.source_predicate": "UMLS:RB", + "e.primary_knowledge_source": "infores:umls-metathesaurus", + "count(e)": 235110 + }, + { + "e.source_predicate": "UMLS:RO", + "e.primary_knowledge_source": "infores:umls-metathesaurus", + "count(e)": 722308 + }, + { + "e.source_predicate": "UMLS:related_to", + "e.primary_knowledge_source": "infores:medlineplus", + "count(e)": 5658 + }, + { + "e.source_predicate": "UMLS:RQ", + "e.primary_knowledge_source": "infores:medlineplus", + "count(e)": 3224 + }, + { + "e.source_predicate": "UMLS:SY", + "e.primary_knowledge_source": "infores:medlineplus", + "count(e)": 932 + }, + { + "e.source_predicate": "UMLS:mapped_to", + "e.primary_knowledge_source": "infores:medlineplus", + "count(e)": 1008 + }, + { + "e.source_predicate": "UMLS:exhibited_by", + "e.primary_knowledge_source": "infores:umls-metathesaurus", + "count(e)": 2332 + }, + { + "e.source_predicate": "UMLS:has_structural_class", + "e.primary_knowledge_source": "infores:medrt-umls", + "count(e)": 4 + }, + { + "e.source_predicate": "UMLS:has_mapping_qualifier", + "e.primary_knowledge_source": "infores:medlineplus", + "count(e)": 42 + }, + { + "e.source_predicate": "UMLS:measures", + "e.primary_knowledge_source": "infores:umls-metathesaurus", + "count(e)": 406 + }, + { + "e.source_predicate": "UMLS:owning_subsection_of", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 84 + }, + { + "e.source_predicate": "UMLS:has_supported_concept_property", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 738 + }, + { + "e.source_predicate": "UMLS:has_supported_concept_relationship", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 648 + }, + { + "e.source_predicate": "UMLS:class_code_classified_by", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 122 + }, + { + "e.source_predicate": "UMLS:owning_section_of", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 18 + }, + { + 
"e.source_predicate": "UMLS:has_context_binding", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 134 + }, + { + "e.source_predicate": "UMLS:may_be_qualified_by", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 40 + }, + { + "e.source_predicate": "UMLS:larger_than", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 2 + }, + { + "e.source_predicate": "UMLS:component_of", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 28 + }, + { + "e.source_predicate": "UMLS:has_component", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 18 + }, + { + "e.source_predicate": "UMLS:has_owning_affiliate", + "e.primary_knowledge_source": "infores:hl7-umls", + "count(e)": 2 + }, + { + "e.source_predicate": "UMLS:has_physiologic_effect", + "e.primary_knowledge_source": "infores:medrt-umls", + "count(e)": 2 + }, + { + "e.source_predicate": "UMLS:has_form", + "e.primary_knowledge_source": "infores:umls-metathesaurus", + "count(e)": 2 + } +] +``` \ No newline at end of file From 90d9b8c1865cfb33d06310592d1ef6169858707e Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 8 Aug 2023 11:32:11 -0700 Subject: [PATCH 011/117] #316 adding more to query development --- understanding_umls.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/understanding_umls.md b/understanding_umls.md index ce7019d2..e0e48b6a 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -841,7 +841,7 @@ I need to look more into how to tell which name is the correct name. 
UMLS Source Predicates: ``` -[ +[ { "e.source_predicate": "UMLS:RB", "e.primary_knowledge_source": "infores:umls-metathesaurus", @@ -958,4 +958,8 @@ UMLS Source Predicates: "count(e)": 2 } ] -``` \ No newline at end of file +``` + +``` +select con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t'), GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') from MRCONSO con left join MRSAT sat on con.CODE=sat.CODE GROUP BY con.CODE, con.SAB; +``` From 5b74948e2a764a0e715039d9c4062e66fc2993ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 8 Aug 2023 18:44:37 -0700 Subject: [PATCH 012/117] #316 start implementing queries --- umls_mysql_to_list_jsonl.py | 99 +++++++++++++++++++++++++++++++++++++ understanding_umls.md | 22 +++++++++ 2 files changed, 121 insertions(+) create mode 100755 umls_mysql_to_list_jsonl.py diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py new file mode 100755 index 00000000..d3c4707f --- /dev/null +++ b/umls_mysql_to_list_jsonl.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +'''umls_mysql_to_list_jsonl.py: extracts all of the information from UMLS and stores it in a JSON Lines output + + Usage: umls_mysql_to_list_jsonl.py [--test] +''' + +__author__ = 'Erica Wood' +__copyright__ = 'Oregon State University' +__credits__ = ['Stephen Ramsey', 'Erica Wood'] +__license__ = 'MIT' +__version__ = '0.1.0' +__maintainer__ = '' +__email__ = '' +__status__ = 'Prototype' + + +import argparse +import kg2_util +import pymysql + + +def make_arg_parser(): + arg_parser = argparse.ArgumentParser(description='umls_mysql_to_list_jsonl.py: extracts all of the information from UMLS and stores it in a JSON Lines output') + arg_parser.add_argument('mysqlConfigFile', type=str) + arg_parser.add_argument('mysqlDBName', type=str) + arg_parser.add_argument('outputFile', type=str) + return arg_parser + + +def code_sources(cursor, output): + code_source_info = dict() + + 
names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" + extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" + + cursor.execute(names_sql_statement) + + cui_key = 'cuis' + name_key = 'names' + info_key = 'info' + + for result in cursor.fetchall(): + (node_id, node_source, cui, name) = result + key = (node_id, node_source) + code_source_info[key] = dict() + code_source_info[key][cui_key] = cui.split(',') + code_source_info[key][name_key] = name.split('\t') + + print("Finished names_sql_statement at", kg2_util.date()) + + cursor.execute(extra_info_sql_statement) + + for result in cursor.fetchall(): + (node_id, node_source, info) = result + key = (node_id, node_source) + if key not in code_source_info: + code_source_info[key] = dict() + print(key, "not in original code_source_info dict") + code_source_info[key][info_key] = info.split('\t') + + print("Finished extra_info_sql_statement at", kg2_util.date()) + + for key, val in code_source_info.items(): + # It needs to print it all out for some reason to actually do the output write + print(str({str(key): val})) + output.write({str(key): val}) + + +if __name__ == '__main__': + print("Starting umls_mysql_to_list_jsonl.py at", kg2_util.date()) + args = make_arg_parser().parse_args() + mysql_config_file = args.mysqlConfigFile + mysql_db_name = args.mysqlDBName + output_file_name = args.outputFile + connection = pymysql.connect(read_default_file=mysql_config_file, db=mysql_db_name) + preds_dict = dict() + + output_info = kg2_util.create_single_jsonlines(False) + output = output_info[0] + + # https://stackoverflow.com/questions/7208773/mysql-row-30153-was-cut-by-group-concat-error + max_len_sql_statement = "SET group_concat_max_len=1000000000" + + sql_statement = ("SELECT 
SUBJECT_CUI, PREDICATE, OBJECT_CUI, GROUP_CONCAT(DISTINCT SUBJECT_SEMTYPE), GROUP_CONCAT(DISTINCT OBJECT_SEMTYPE), " + "GROUP_CONCAT(DISTINCT DATE_FORMAT(CURR_TIMESTAMP, '%Y-%m-%d %H:%i:%S')), " + "GROUP_CONCAT(CONCAT(PMID, '|', SENTENCE, '|', SUBJECT_SCORE, '|', OBJECT_SCORE, '|', DP) SEPARATOR '\t') " + "FROM ((PREDICATION NATURAL JOIN CITATIONS) NATURAL JOIN SENTENCE) NATURAL JOIN PREDICATION_AUX " + "GROUP BY SUBJECT_CUI, PREDICATE, OBJECT_CUI") + + with connection.cursor() as cursor: + cursor.execute(max_len_sql_statement) + cursor.fetchall() + + # Execute statement we care about after clearing any "results" + code_sources(cursor, output) + connection.close() + + kg2_util.close_single_jsonlines(output_info, output_file_name) + print("Finishing umls_mysql_to_list_jsonl.py at", kg2_util.date()) diff --git a/understanding_umls.md b/understanding_umls.md index e0e48b6a..65f4b8e1 100644 --- a/understanding_umls.md +++ b/understanding_umls.md @@ -963,3 +963,25 @@ UMLS Source Predicates: ``` select con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t'), GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') from MRCONSO con left join MRSAT sat on con.CODE=sat.CODE GROUP BY con.CODE, con.SAB; ``` +This ran for about 4.5 hours before hitting this error: +``` +ERROR 1114 (HY000): The table '/tmp/#sql31e_8_6' is full +``` + +New plan - split it up: + +``` +select con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t') from MRCONSO con GROUP BY con.CODE, con.SAB; +``` +gives +``` +7137936 rows in set, 3758 warnings (1 min 29.37 sec) +``` + +``` +select sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') from MRSAT sat GROUP BY sat.CODE, sat.SAB; +``` +gives +``` +5330040 rows in set, 65535 warnings (10 min 11.85 sec) +``` \ No newline at end of file From 7068316cef666ff77059071bad627b133ce40e0e 
Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 9 Aug 2023 12:57:36 -0700 Subject: [PATCH 013/117] #316 add CUI related code --- umls_mysql_to_list_jsonl.py | 81 +++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index d3c4707f..01f61f8a 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -29,16 +29,14 @@ def make_arg_parser(): def code_sources(cursor, output): code_source_info = dict() + cui_key = 'cuis' + name_key = 'names' + info_key = 'info' names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" cursor.execute(names_sql_statement) - - cui_key = 'cuis' - name_key = 'names' - info_key = 'info' - for result in cursor.fetchall(): (node_id, node_source, cui, name) = result key = (node_id, node_source) @@ -49,7 +47,6 @@ def code_sources(cursor, output): print("Finished names_sql_statement at", kg2_util.date()) cursor.execute(extra_info_sql_statement) - for result in cursor.fetchall(): (node_id, node_source, info) = result key = (node_id, node_source) @@ -66,6 +63,70 @@ def code_sources(cursor, output): output.write({str(key): val}) +def cui_sources(cursor, output): + cui_source_info = dict() + tui_key = 'tuis' + name_key = 'names' + relation_key = 'relations' + definitions_key = 'definitions' + + names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE LAT=\"ENG\" GROUP BY CUI" + tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY CUI" + relations_sql_statement = "SELECT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL" + definitions_sql_statement = "SELECT 
CUI, DEF FROM MRDEF" + + cursor.execute(names_sql_statement) + for result in cursor.fetchall(): + (node_id, name) = result + key = node_id + cui_source_info[key] = dict() + cui_source_info[key][name_key] = name.split('\t') + + print("Finished names_sql_statement at", kg2_util.date()) + + cursor.execute(tuis_sql_statement) + for result in cursor.fetchall(): + (node_id, tuis) = result + key = node_id + if key not in cui_source_info: + # This happens if a node doesn't have an English name. Since UMLS:C5779458 (an example one) + # wasn't in KG2.8.3pre, I am having these skipped + continue + cui_source_info[key][tui_key] = tuis.split('\t') + + print("Finished tuis_sql_statement at", kg2_util.date()) + + cursor.execute(relations_sql_statement) + for result in cursor.fetchall(): + (cui1, rel, rela, direction, cui2, source) = result + key = cui1 + if key not in cui_source_info: + # See above for explanation + continue + if relation_key not in cui_source_info[key]: + cui_source_info[key][relation_key] = list() + cui_source_info[key][relation_key].append((rel, rela, direction, cui2, source)) + + print("Finished relations_sql_statement at", kg2_util.date()) + + cursor.execute(definitions_sql_statement) + for result in cursor.fetchall(): + (node_id, definition) = result + key = node_id + if key not in cui_source_info: + # See above for explanation + continue + cui_source_info[key][definitions_key] = definition + + print("Finished definitions_sql_statement at", kg2_util.date()) + + for key, val in cui_source_info.items(): + # It needs to print it all out for some reason to actually do the output write + print(str({str(key): val})) + output.write({str(key): val}) + + + if __name__ == '__main__': print("Starting umls_mysql_to_list_jsonl.py at", kg2_util.date()) args = make_arg_parser().parse_args() @@ -81,18 +142,12 @@ def code_sources(cursor, output): # https://stackoverflow.com/questions/7208773/mysql-row-30153-was-cut-by-group-concat-error max_len_sql_statement = "SET 
group_concat_max_len=1000000000" - sql_statement = ("SELECT SUBJECT_CUI, PREDICATE, OBJECT_CUI, GROUP_CONCAT(DISTINCT SUBJECT_SEMTYPE), GROUP_CONCAT(DISTINCT OBJECT_SEMTYPE), " - "GROUP_CONCAT(DISTINCT DATE_FORMAT(CURR_TIMESTAMP, '%Y-%m-%d %H:%i:%S')), " - "GROUP_CONCAT(CONCAT(PMID, '|', SENTENCE, '|', SUBJECT_SCORE, '|', OBJECT_SCORE, '|', DP) SEPARATOR '\t') " - "FROM ((PREDICATION NATURAL JOIN CITATIONS) NATURAL JOIN SENTENCE) NATURAL JOIN PREDICATION_AUX " - "GROUP BY SUBJECT_CUI, PREDICATE, OBJECT_CUI") - with connection.cursor() as cursor: cursor.execute(max_len_sql_statement) cursor.fetchall() # Execute statement we care about after clearing any "results" - code_sources(cursor, output) + cui_sources(cursor, output) connection.close() kg2_util.close_single_jsonlines(output_info, output_file_name) From e99d8f96ceafdc7b8123ffd99c0442aae900b15e Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 9 Aug 2023 14:39:06 -0700 Subject: [PATCH 014/117] #316 we currently only need english based names --- umls_mysql_to_list_jsonl.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 01f61f8a..e2f2a04b 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -27,6 +27,21 @@ def make_arg_parser(): return arg_parser +def get_english_sources(cursor): + sources_sql_statement = "SELECT RSAB, LAT FROM MRSAB" + sources = [] + + cursor.execute(sources_sql_statement) + for result in cursor.fetchall(): + (source, language) = result + if language == 'ENG': + sources.append(source) + + print("Finished sources_sql_statement at", kg2_util.date()) + + return sources + + def code_sources(cursor, output): code_source_info = dict() cui_key = 'cuis' @@ -63,17 +78,19 @@ def code_sources(cursor, output): output.write({str(key): val}) -def cui_sources(cursor, output): +def cui_sources(cursor, output, sources): cui_source_info = dict() tui_key = 'tuis' name_key = 
'names' relation_key = 'relations' definitions_key = 'definitions' - names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE LAT=\"ENG\" GROUP BY CUI" + sources_where = str(sources).replace('[', '(').replace(']', ')') + + names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY CUI" - relations_sql_statement = "SELECT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL" - definitions_sql_statement = "SELECT CUI, DEF FROM MRDEF" + relations_sql_statement = "SELECT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where + definitions_sql_statement = "SELECT CUI, DEF FROM MRDEF WHERE SAB IN " + sources_where cursor.execute(names_sql_statement) for result in cursor.fetchall(): @@ -89,8 +106,7 @@ def cui_sources(cursor, output): (node_id, tuis) = result key = node_id if key not in cui_source_info: - # This happens if a node doesn't have an English name. Since UMLS:C5779458 (an example one) - # wasn't in KG2.8.3pre, I am having these skipped + # This happens if a node doesn't have an English name. 
See https://github.com/RTXteam/RTX-KG2/issues/316#issuecomment-1672074392 continue cui_source_info[key][tui_key] = tuis.split('\t') @@ -147,7 +163,9 @@ def cui_sources(cursor, output): cursor.fetchall() # Execute statement we care about after clearing any "results" - cui_sources(cursor, output) + sources = get_english_sources(cursor) + + cui_sources(cursor, output, sources) connection.close() kg2_util.close_single_jsonlines(output_info, output_file_name) From 83d641e745201983d71509969bfbe523f21aa244 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 9 Aug 2023 16:58:00 -0700 Subject: [PATCH 015/117] #316 making some progress with structuring name data --- umls_mysql_to_list_jsonl.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index e2f2a04b..1862fce2 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -94,10 +94,18 @@ def cui_sources(cursor, output, sources): cursor.execute(names_sql_statement) for result in cursor.fetchall(): - (node_id, name) = result + (node_id, names) = result key = node_id cui_source_info[key] = dict() - cui_source_info[key][name_key] = name.split('\t') + cui_source_info[key][name_key] = dict() + for name in names.split('\t'): + split_name = name.split('|') + assert len(split_name) == 3, split_name + if split_name[0] not in cui_source_info[key][name_key]: + cui_source_info[key][name_key][split_name[0]] = dict() + if split_name[1] not in cui_source_info[key][name_key][split_name[0]]: + cui_source_info[key][name_key][split_name[0]][split_name[1]] = list() + existing_val = cui_source_info[key][name_key][split_name[0]][split_name[1]].append(split_name[2]) print("Finished names_sql_statement at", kg2_util.date()) @@ -166,6 +174,8 @@ def cui_sources(cursor, output, sources): sources = get_english_sources(cursor) cui_sources(cursor, output, sources) + + # code_sources(cursor, output) connection.close() 
kg2_util.close_single_jsonlines(output_info, output_file_name) From acac6310c6beb2f77356f57305ae582514eba612 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 10 Aug 2023 14:53:38 -0700 Subject: [PATCH 016/117] #316 making the extraction actually pull stuff out and store it in a helpful way --- umls_mysql_to_list_jsonl.py | 61 ++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 1862fce2..93b150e8 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -48,16 +48,25 @@ def code_sources(cursor, output): name_key = 'names' info_key = 'info' - names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" - extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', sat.ATV) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" + names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" + extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" cursor.execute(names_sql_statement) for result in cursor.fetchall(): - (node_id, node_source, cui, name) = result + (node_id, node_source, cui, names) = result key = (node_id, node_source) code_source_info[key] = dict() code_source_info[key][cui_key] = cui.split(',') - code_source_info[key][name_key] = name.split('\t') + if name_key not in code_source_info[key]: + code_source_info[key][name_key] = dict() + for name in names.split('\t'): + split_name = name.split('|') + assert len(split_name) == 3, split_name + if split_name[0] not in 
code_source_info[key][name_key]: + code_source_info[key][name_key][split_name[0]] = dict() + if split_name[1] not in code_source_info[key][name_key][split_name[0]]: + code_source_info[key][name_key][split_name[0]][split_name[1]] = list() + code_source_info[key][name_key][split_name[0]][split_name[1]].append(split_name[2]) print("Finished names_sql_statement at", kg2_util.date()) @@ -66,17 +75,28 @@ def code_sources(cursor, output): (node_id, node_source, info) = result key = (node_id, node_source) if key not in code_source_info: - code_source_info[key] = dict() - print(key, "not in original code_source_info dict") - code_source_info[key][info_key] = info.split('\t') + # This occurs if a node doesn't have a name. + continue + if info_key not in code_source_info[key]: + code_source_info[key][info_key] = dict() + for info_piece in info.split('\t'): + split_info_piece = info_piece.split('|') + assert len(split_info_piece) == 2, split_info_piece + if split_info_piece[0] not in code_source_info[key][info_key]: + code_source_info[key][info_key][split_info_piece[0]] = set() + code_source_info[key][info_key][split_info_piece[0]].add(split_info_piece[1]) + for info_type in code_source_info[key][info_key]: + code_source_info[key][info_key][info_type] = list(code_source_info[key][info_key][info_type]) print("Finished extra_info_sql_statement at", kg2_util.date()) + record_num = 0 for key, val in code_source_info.items(): - # It needs to print it all out for some reason to actually do the output write - print(str({str(key): val})) + record_num += 1 output.write({str(key): val}) + print("Finished adding", record_num, "records in code_sources() at", kg2_util.date()) + def cui_sources(cursor, output, sources): cui_source_info = dict() @@ -105,7 +125,7 @@ def cui_sources(cursor, output, sources): cui_source_info[key][name_key][split_name[0]] = dict() if split_name[1] not in cui_source_info[key][name_key][split_name[0]]: cui_source_info[key][name_key][split_name[0]][split_name[1]] = 
list() - existing_val = cui_source_info[key][name_key][split_name[0]][split_name[1]].append(split_name[2]) + cui_source_info[key][name_key][split_name[0]][split_name[1]].append(split_name[2]) print("Finished names_sql_statement at", kg2_util.date()) @@ -116,7 +136,7 @@ def cui_sources(cursor, output, sources): if key not in cui_source_info: # This happens if a node doesn't have an English name. See https://github.com/RTXteam/RTX-KG2/issues/316#issuecomment-1672074392 continue - cui_source_info[key][tui_key] = tuis.split('\t') + cui_source_info[key][tui_key] = tuis.split(',') print("Finished tuis_sql_statement at", kg2_util.date()) @@ -128,8 +148,14 @@ def cui_sources(cursor, output, sources): # See above for explanation continue if relation_key not in cui_source_info[key]: - cui_source_info[key][relation_key] = list() - cui_source_info[key][relation_key].append((rel, rela, direction, cui2, source)) + cui_source_info[key][relation_key] = dict() + + relation_type_key = ','.join([str(rel), str(rela), str(direction)]) + if source not in cui_source_info[key][relation_key]: + cui_source_info[key][relation_key][source] = dict() + if relation_type_key not in cui_source_info[key][relation_key][source]: + cui_source_info[key][relation_key][source][relation_type_key] = list() + cui_source_info[key][relation_key][source][relation_type_key].append(cui2) print("Finished relations_sql_statement at", kg2_util.date()) @@ -144,11 +170,12 @@ def cui_sources(cursor, output, sources): print("Finished definitions_sql_statement at", kg2_util.date()) + record_num = 0 for key, val in cui_source_info.items(): - # It needs to print it all out for some reason to actually do the output write - print(str({str(key): val})) + record_num += 1 output.write({str(key): val}) + print("Finished adding", record_num, "records in cui_sources() at", kg2_util.date()) if __name__ == '__main__': @@ -173,9 +200,9 @@ def cui_sources(cursor, output, sources): # Execute statement we care about after clearing any 
"results" sources = get_english_sources(cursor) - cui_sources(cursor, output, sources) + code_sources(cursor, output) + # cui_sources(cursor, output, sources) - # code_sources(cursor, output) connection.close() kg2_util.close_single_jsonlines(output_info, output_file_name) From e361f8483a9effa04714e04b9b1254c6c870d5d9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 15 Aug 2023 12:03:51 -0700 Subject: [PATCH 017/117] #316 more name information on CUI sources, TUIs on code sources --- umls_mysql_to_list_jsonl.py | 46 ++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 93b150e8..8bdb02c0 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -44,12 +44,17 @@ def get_english_sources(cursor): def code_sources(cursor, output): code_source_info = dict() + tui_key = 'tuis' cui_key = 'cuis' name_key = 'names' - info_key = 'info' + # See info about these here: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + info_key = 'attributes' + + # See TTY meanings here: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" + tuis_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT sty.TUI) FROM MRCONSO con LEFT JOIN MRSTY sty ON con.CUI = sty.CUI GROUP BY con.CODE, con.SAB" cursor.execute(names_sql_statement) for result in cursor.fetchall(): @@ -90,6 +95,17 @@ def code_sources(cursor, output): print("Finished extra_info_sql_statement at", kg2_util.date()) 
+ cursor.execute(tuis_sql_statement) + for result in cursor.fetchall(): + (node_id, node_source, tuis) = result + key = (node_id, node_source) + if key not in code_source_info: + # This occurs if a node doesn't have a name. + continue + code_source_info[key][tui_key] = tuis.split(',') + + print("Finished tuis_sql_statement at", kg2_util.date()) + record_num = 0 for key, val in code_source_info.items(): record_num += 1 @@ -105,11 +121,13 @@ def cui_sources(cursor, output, sources): relation_key = 'relations' definitions_key = 'definitions' + # Make the sources list a MySQL list sources_where = str(sources).replace('[', '(').replace(']', ')') - names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" + # See TTY meanings here: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html + names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(TTY, '|', SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY CUI" - relations_sql_statement = "SELECT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where + relations_sql_statement = "SELECT DISTINCT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where definitions_sql_statement = "SELECT CUI, DEF FROM MRDEF WHERE SAB IN " + sources_where cursor.execute(names_sql_statement) @@ -120,12 +138,18 @@ def cui_sources(cursor, output, sources): cui_source_info[key][name_key] = dict() for name in names.split('\t'): split_name = name.split('|') - assert len(split_name) == 3, split_name - if split_name[0] not in cui_source_info[key][name_key]: - cui_source_info[key][name_key][split_name[0]] = dict() - if split_name[1] not in cui_source_info[key][name_key][split_name[0]]: - 
cui_source_info[key][name_key][split_name[0]][split_name[1]] = list() - cui_source_info[key][name_key][split_name[0]][split_name[1]].append(split_name[2]) + assert len(split_name) == 4, split_name + name_tty = split_name[0] + name_source = split_name[1] + name_ispref = split_name[2] + name_str = split_name[3] + if name_source not in cui_source_info[key][name_key]: + cui_source_info[key][name_key][name_source] = dict() + if name_tty not in cui_source_info[key][name_key][name_source]: + cui_source_info[key][name_key][name_source][name_tty] = dict() + if name_ispref not in cui_source_info[key][name_key][name_source][name_tty]: + cui_source_info[key][name_key][name_source][name_tty][name_ispref] = list() + cui_source_info[key][name_key][name_source][name_tty][name_ispref].append(name_str) print("Finished names_sql_statement at", kg2_util.date()) @@ -197,11 +221,11 @@ def cui_sources(cursor, output, sources): cursor.execute(max_len_sql_statement) cursor.fetchall() - # Execute statement we care about after clearing any "results" + # This ensure we don't have UMLS sources that overwrite each other's names sources = get_english_sources(cursor) code_sources(cursor, output) - # cui_sources(cursor, output, sources) + cui_sources(cursor, output, sources) connection.close() From 4c71b157a2e618a681ac1828c6f32588eb902ffc Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 09:23:26 -0700 Subject: [PATCH 018/117] #316 making it easier to process --- umls_mysql_to_list_jsonl.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 8bdb02c0..88ff11a1 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 '''umls_mysql_to_list_jsonl.py: extracts all of the information from UMLS and stores it in a JSON Lines output - Usage: umls_mysql_to_list_jsonl.py [--test] + Usage: umls_mysql_to_list_jsonl.py [--test] ''' __author__ = 
'Erica Wood' @@ -19,12 +19,12 @@ import pymysql -def make_arg_parser(): +def get_args(): arg_parser = argparse.ArgumentParser(description='umls_mysql_to_list_jsonl.py: extracts all of the information from UMLS and stores it in a JSON Lines output') arg_parser.add_argument('mysqlConfigFile', type=str) arg_parser.add_argument('mysqlDBName', type=str) arg_parser.add_argument('outputFile', type=str) - return arg_parser + return arg_parser.parse_args() def get_english_sources(cursor): @@ -120,6 +120,7 @@ def cui_sources(cursor, output, sources): name_key = 'names' relation_key = 'relations' definitions_key = 'definitions' + umls_source_name = 'UMLS' # Make the sources list a MySQL list sources_where = str(sources).replace('[', '(').replace(']', ')') @@ -133,7 +134,7 @@ def cui_sources(cursor, output, sources): cursor.execute(names_sql_statement) for result in cursor.fetchall(): (node_id, names) = result - key = node_id + key = (node_id, umls_source_name) cui_source_info[key] = dict() cui_source_info[key][name_key] = dict() for name in names.split('\t'): @@ -156,7 +157,7 @@ def cui_sources(cursor, output, sources): cursor.execute(tuis_sql_statement) for result in cursor.fetchall(): (node_id, tuis) = result - key = node_id + key = (node_id, umls_source_name) if key not in cui_source_info: # This happens if a node doesn't have an English name. 
See https://github.com/RTXteam/RTX-KG2/issues/316#issuecomment-1672074392 continue @@ -167,7 +168,7 @@ def cui_sources(cursor, output, sources): cursor.execute(relations_sql_statement) for result in cursor.fetchall(): (cui1, rel, rela, direction, cui2, source) = result - key = cui1 + key = (cui1, umls_source_name) if key not in cui_source_info: # See above for explanation continue @@ -186,7 +187,7 @@ def cui_sources(cursor, output, sources): cursor.execute(definitions_sql_statement) for result in cursor.fetchall(): (node_id, definition) = result - key = node_id + key = (node_id, umls_source_name) if key not in cui_source_info: # See above for explanation continue @@ -204,7 +205,7 @@ def cui_sources(cursor, output, sources): if __name__ == '__main__': print("Starting umls_mysql_to_list_jsonl.py at", kg2_util.date()) - args = make_arg_parser().parse_args() + args = get_args() mysql_config_file = args.mysqlConfigFile mysql_db_name = args.mysqlDBName output_file_name = args.outputFile From 38c2473ecb8f716468429c3adde21c3c49416e7f Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 09:24:17 -0700 Subject: [PATCH 019/117] #316 starting to process the JSON Lines list into nodes/edges --- umls_list_jsonl_to_kg_jsonl.py | 237 +++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 umls_list_jsonl_to_kg_jsonl.py diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py new file mode 100644 index 00000000..5cc50f37 --- /dev/null +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +'''umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format + + Usage: umls_list_jsonl_to_kg_jsonl.py [--test] +''' + +__author__ = 'Erica Wood' +__copyright__ = 'Oregon State University' +__credits__ = ['Stephen Ramsey', 'Erica Wood'] +__license__ = 'MIT' +__version__ = '0.1.0' +__maintainer__ = '' +__email__ = '' +__status__ = 'Prototype' + + +import argparse +import kg2_util 
+import json + + +DESIRED_CODES = ['ATC', 'CHV', 'DRUGBANK', 'FMA', 'GO', 'HCPCS', 'HGNC', 'HL7V3.0', + 'HL7', 'HPO', 'ICD10PCS', 'ICD9CM', 'MED-RT', 'MEDLINEPLUS', 'MSH', + 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', + 'PSY', 'RXNORM', 'VANDF'] +CUIS_KEY = 'cuis' +INFO_KEY = 'info' +NAMES_KEY = 'names' +TUIS_KEY = 'tuis' + +TUI_MAPPINGS = {"T001": "individual organism", + "T002": "organism taxon", + "T004": "organism taxon", + "T005": "organism taxon", + "T007": "organism taxon", + "T008": "organism taxon", + "T010": "organism taxon", + "T011": "organism taxon", + "T012": "organism taxon", + "T013": "organism taxon", + "T014": "organism taxon", + "T015": "organism taxon", + "T016": "organism taxon", + "T017": "anatomical entity", + "T018": "gross anatomical structure", + "T019": "disease", + "T020": "disease", + "T021": "gross anatomical structure", + "T022": "anatomical entity", + "T023": "gross anatomical structure", + "T024": "gross anatomical structure", + "T025": "cell", + "T026": "cellular component", + "T028": "biological entity", + "T029": "anatomical entity", + "T030": "anatomical entity", + "T031": "anatomical entity", + "T032": "named thing", + "T033": "disease or phenotypic feature", + "T034": "phenomenon", + "T037": "pathological process", + "T038": "phenomenon", + "T039": "physiological process", + "T040": "physiological process", + "T041": "behavior", + "T042": "physiological process", + "T043": "physiological process", + "T044": "molecular activity", + "T045": "physiological process", + "T046": "pathological process", + "T047": "disease", + "T048": "disease", + "T049": "disease", + "T050": "biological entity", + "T051": "event", + "T052": "activity", + "T053": "behavior", + "T054": "behavior", + "T055": "behavior", + "T056": "activity", + "T057": "activity", + "T058": "activity", + "T059": "procedure", + "T060": "procedure", + "T061": "procedure", + "T062": "activity", + "T063": "procedure", + "T064": "activity", + "T065": 
"activity", + "T066": "activity", + "T067": "phenomenon", + "T068": "phenomenon", + "T069": "phenomenon", + "T070": "phenomenon", + "T071": "named thing", + "T072": "physical entity", + "T073": "physical entity", + "T074": "device", + "T075": "device", + "T077": "information content entity", + "T078": "information content entity", + "T079": "information content entity", + "T080": "information content entity", + "T081": "information content entity", + "T082": "information content entity", + "T083": "geographic location", + "T085": "biological entity", + "T086": "nucleic acid entity", + "T087": "polypeptide", + "T088": "biological entity", + "T089": "information content entity", + "T090": "individual organism", + "T091": "named thing", + "T092": "agent", + "T093": "agent", + "T094": "agent", + "T095": "agent", + "T096": "agent", + "T097": "cohort", + "T098": "population of individual organisms", + "T099": "cohort", + "T100": "cohort", + "T101": "cohort", + "T102": "information content entity", + "T103": "chemical entity", + "T104": "chemical entity", + "T109": "chemical entity", + "T114": "nucleic acid entity", + "T116": "polypeptide", + "T120": "chemical entity", + "T121": "drug", + "T122": "device", + "T123": "chemical entity", + "T125": "chemical entity", + "T126": "protein", + "T127": "small molecule", + "T129": "biological entity", + "T130": "chemical entity", + "T131": "chemical entity", + "T167": "chemical entity", + "T168": "food", + "T169": "information content entity", + "T170": "publication", + "T171": "information content entity", + "T184": "phenotypic feature", + "T185": "information content entity", + "T190": "disease", + "T191": "disease", + "T192": "protein", + "T194": "organism taxon", + "T195": "drug", + "T196": "small molecule", + "T197": "chemical entity", + "T200": "drug", + "T201": "named thing", + "T203": "device", + "T204": "organism taxon"} + +def get_args(): + arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: 
converts UMLS MySQL JSON Lines dump into KG2 JSON format') + arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('outputNodesFile', type=str) + arg_parser.add_argument('outputEdgesFile', type=str) + return arg_parser.parse_args() + + +def extract_node_id(node_id_str): + node_id_str = node_id_str.replace('(', '').replace(')', '').replace("'", '') + node_id = node_id_str.split(',') + return node_id[1].strip(), node_id[0].strip() + + +def make_node_id(curie_prefix, node_id_val): + return curie_prefix + ':' + node_id_val + + +def process_drugbank_item(node_id_val, info): + node_curie = make_node_id(kg2_util.CURIE_PREFIX_DRUGBANK, node_id_val) + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) + secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) + name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('N', list()) + if len(name) == 0: + name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('Y', list()) + assert len(name) == 1, str(name) + " " + node_curie + name = name[0] + synonyms = list() + for syn_cat in info.get('SY', dict()): + synonyms += info['SY'][syn_cat] + + print(json.dumps({'node_curie': node_curie, 'cuis': cuis, 'tuis': tuis, 'fda_codes': fda_codes, 'secondary_accession_keys': secondary_accession_keys, 'name': name, 'synonyms': synonyms})) + return str(tuis) + + +if __name__ == '__main__': + args = get_args() + input_file_name = args.inputFile + + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_items = input_read_jsonlines_info[0] + + tui_combos = dict() + + for data in input_items: + # There should only be one item in the data dictionary + for entity in data: + if entity == "('NOCODE', 'MTH')": + continue + value = data[entity] + source, node_id_val = extract_node_id(entity) + if source not in DESIRED_CODES and source != 'UMLS': + continue + + # Process the data specifically by source 
+ tui_combo = tuple(sorted(value.get(TUIS_KEY, list()))) + if tui_combo not in tui_combos: + tui_combos[tui_combo] = dict() + tui_combos[tui_combo]['tuis'] = list() + tui_combos[tui_combo]['tui_count'] = 0 + tui_combos[tui_combo]['tuis'].append(entity) + tui_combos[tui_combo]['tui_count'] += 1 + if source == 'DRUGBANK': + process_drugbank_item(node_id_val, value) + + lines = str() + for tui_combo in tui_combos: + line = str(tui_combos[tui_combo]['tui_count']) + '\t' + for tui in tui_combo: + line += tui + "\t" + TUI_MAPPINGS[tui] + "\t" + line = line.strip() + line += '\n' + lines += line + + print(lines) + + kg2_util.end_read_jsonlines(input_read_jsonlines_info) \ No newline at end of file From f84d38cbba72a9fa899c8d02870ab3b9dfbecce8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 15:14:04 -0700 Subject: [PATCH 020/117] #316 UMLS DrugBank nodes seems to be getting brought in correctly --- umls_list_jsonl_to_kg_jsonl.py | 205 ++++++++------------------------- 1 file changed, 47 insertions(+), 158 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 5cc50f37..0daf46e0 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -24,143 +24,19 @@ 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', 'PSY', 'RXNORM', 'VANDF'] CUIS_KEY = 'cuis' -INFO_KEY = 'info' +INFO_KEY = 'attributes' NAMES_KEY = 'names' TUIS_KEY = 'tuis' -TUI_MAPPINGS = {"T001": "individual organism", - "T002": "organism taxon", - "T004": "organism taxon", - "T005": "organism taxon", - "T007": "organism taxon", - "T008": "organism taxon", - "T010": "organism taxon", - "T011": "organism taxon", - "T012": "organism taxon", - "T013": "organism taxon", - "T014": "organism taxon", - "T015": "organism taxon", - "T016": "organism taxon", - "T017": "anatomical entity", - "T018": "gross anatomical structure", - "T019": "disease", - "T020": "disease", - "T021": "gross anatomical structure", - "T022": "anatomical 
entity", - "T023": "gross anatomical structure", - "T024": "gross anatomical structure", - "T025": "cell", - "T026": "cellular component", - "T028": "biological entity", - "T029": "anatomical entity", - "T030": "anatomical entity", - "T031": "anatomical entity", - "T032": "named thing", - "T033": "disease or phenotypic feature", - "T034": "phenomenon", - "T037": "pathological process", - "T038": "phenomenon", - "T039": "physiological process", - "T040": "physiological process", - "T041": "behavior", - "T042": "physiological process", - "T043": "physiological process", - "T044": "molecular activity", - "T045": "physiological process", - "T046": "pathological process", - "T047": "disease", - "T048": "disease", - "T049": "disease", - "T050": "biological entity", - "T051": "event", - "T052": "activity", - "T053": "behavior", - "T054": "behavior", - "T055": "behavior", - "T056": "activity", - "T057": "activity", - "T058": "activity", - "T059": "procedure", - "T060": "procedure", - "T061": "procedure", - "T062": "activity", - "T063": "procedure", - "T064": "activity", - "T065": "activity", - "T066": "activity", - "T067": "phenomenon", - "T068": "phenomenon", - "T069": "phenomenon", - "T070": "phenomenon", - "T071": "named thing", - "T072": "physical entity", - "T073": "physical entity", - "T074": "device", - "T075": "device", - "T077": "information content entity", - "T078": "information content entity", - "T079": "information content entity", - "T080": "information content entity", - "T081": "information content entity", - "T082": "information content entity", - "T083": "geographic location", - "T085": "biological entity", - "T086": "nucleic acid entity", - "T087": "polypeptide", - "T088": "biological entity", - "T089": "information content entity", - "T090": "individual organism", - "T091": "named thing", - "T092": "agent", - "T093": "agent", - "T094": "agent", - "T095": "agent", - "T096": "agent", - "T097": "cohort", - "T098": "population of individual organisms", - 
"T099": "cohort", - "T100": "cohort", - "T101": "cohort", - "T102": "information content entity", - "T103": "chemical entity", - "T104": "chemical entity", - "T109": "chemical entity", - "T114": "nucleic acid entity", - "T116": "polypeptide", - "T120": "chemical entity", - "T121": "drug", - "T122": "device", - "T123": "chemical entity", - "T125": "chemical entity", - "T126": "protein", - "T127": "small molecule", - "T129": "biological entity", - "T130": "chemical entity", - "T131": "chemical entity", - "T167": "chemical entity", - "T168": "food", - "T169": "information content entity", - "T170": "publication", - "T171": "information content entity", - "T184": "phenotypic feature", - "T185": "information content entity", - "T190": "disease", - "T191": "disease", - "T192": "protein", - "T194": "organism taxon", - "T195": "drug", - "T196": "small molecule", - "T197": "chemical entity", - "T200": "drug", - "T201": "named thing", - "T203": "device", - "T204": "organism taxon"} +UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE + def get_args(): arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) + arg_parser.add_argument('--test', dest='test', action="store_true", default=False) return arg_parser.parse_args() @@ -170,12 +46,15 @@ def extract_node_id(node_id_str): return node_id[1].strip(), node_id[0].strip() -def make_node_id(curie_prefix, node_id_val): - return curie_prefix + ':' + node_id_val +def make_node_id(curie_prefix, node_id): + return curie_prefix + ':' + node_id -def process_drugbank_item(node_id_val, info): - node_curie = make_node_id(kg2_util.CURIE_PREFIX_DRUGBANK, node_id_val) +def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + curie_prefix = 
kg2_util.CURIE_PREFIX_DRUGBANK + provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) + iri = iri_mappings[curie_prefix] + node_id + node_curie = make_node_id(curie_prefix, node_id) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) @@ -186,21 +65,48 @@ def process_drugbank_item(node_id_val, info): assert len(name) == 1, str(name) + " " + node_curie name = name[0] synonyms = list() - for syn_cat in info.get('SY', dict()): - synonyms += info['SY'][syn_cat] + for syn_cat in info.get(NAMES_KEY, dict()).get('SY', dict()): + synonyms += info.get(NAMES_KEY, dict()).get('SY', dict())[syn_cat] + for syn_cat in info.get(NAMES_KEY, dict()).get('FSY', dict()): + synonyms += info.get(NAMES_KEY, dict()).get('FSY', dict())[syn_cat] + + # TODO: figure out update date + node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) + node['synonym'] = synonyms + description = str() + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description.strip("; ") + node['description'] = description - print(json.dumps({'node_curie': node_curie, 'cuis': cuis, 'tuis': tuis, 'fda_codes': fda_codes, 'secondary_accession_keys': secondary_accession_keys, 'name': name, 'synonyms': synonyms})) - return str(tuis) + nodes_output.write(node) if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + test_mode = args.test + output_nodes_file_name = args.outputNodesFile + output_edges_file_name = args.outputEdgesFile + + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) + nodes_output = nodes_info[0] + edges_output = edges_info[0] input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_items = input_read_jsonlines_info[0] - tui_combos = dict() + tui_mappings = dict() + + with open('tui_combo_mappings.json') as mappings: + tui_mappings = json.load(mappings) + + iri_mappings = dict() + 
iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('curies-to-urls-map.yaml'))['use_for_bidirectional_mapping'] + for item in iri_mappings_raw: + for prefix in item: + iri_mappings[prefix] = item[prefix] + print(json.dumps(iri_mappings, indent=4, sort_keys=True)) for data in input_items: # There should only be one item in the data dictionary @@ -208,30 +114,13 @@ def process_drugbank_item(node_id_val, info): if entity == "('NOCODE', 'MTH')": continue value = data[entity] - source, node_id_val = extract_node_id(entity) + source, node_id = extract_node_id(entity) if source not in DESIRED_CODES and source != 'UMLS': continue # Process the data specifically by source - tui_combo = tuple(sorted(value.get(TUIS_KEY, list()))) - if tui_combo not in tui_combos: - tui_combos[tui_combo] = dict() - tui_combos[tui_combo]['tuis'] = list() - tui_combos[tui_combo]['tui_count'] = 0 - tui_combos[tui_combo]['tuis'].append(entity) - tui_combos[tui_combo]['tui_count'] += 1 if source == 'DRUGBANK': - process_drugbank_item(node_id_val, value) - - lines = str() - for tui_combo in tui_combos: - line = str(tui_combos[tui_combo]['tui_count']) + '\t' - for tui in tui_combo: - line += tui + "\t" + TUI_MAPPINGS[tui] + "\t" - line = line.strip() - line += '\n' - lines += line - - print(lines) + process_drugbank_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) - kg2_util.end_read_jsonlines(input_read_jsonlines_info) \ No newline at end of file + kg2_util.end_read_jsonlines(input_read_jsonlines_info) + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file From 007352b097513ed89595672623d5d7c4f5ed99e8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 15:28:57 -0700 Subject: [PATCH 021/117] Correcting issue per CI --- extract-mirbase.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extract-mirbase.sh b/extract-mirbase.sh index 
d4112368..8dee866e 100755 --- a/extract-mirbase.sh +++ b/extract-mirbase.sh @@ -23,8 +23,8 @@ output_file=${1:-"${BUILD_DIR}/miRNA.dat"} mkdir -p ${output_dir} -${curl_get} https://www.mirbase.org/download_file/miRNA.dat/ > /tmp/miRNA.dat -${curl_get} https://www.mirbase.org/download_readme/ > ${output_dir}/miRBase_README.txt +${curl_get} https://mirbase.org/download/miRNA.dat/ > /tmp/miRNA.dat +${curl_get} https://mirbase.org/download/README/ > ${output_dir}/miRBase_README.txt sed -i "s/
//" ${output_dir}/miRBase_README.txt version_number=`grep -m 1 "The miRBase Sequence Database -- Release" ${output_dir}/miRBase_README.txt | cut -f7 -d ' '` From 50da25ad8487aa9910426b1a5d518d6e85680208 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 17:01:08 -0700 Subject: [PATCH 022/117] #316 ATC and CHV --- umls_list_jsonl_to_kg_jsonl.py | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 0daf46e0..d372084d 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -50,6 +50,90 @@ def make_node_id(curie_prefix, node_id): return curie_prefix + ':' + node_id +def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + curie_prefix = kg2_util.CURIE_PREFIX_ATC + provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) + iri = iri_mappings[curie_prefix] + node_id + node_curie = make_node_id(curie_prefix, node_id) + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + + # Currently not used, but extracting them in case we want them in the future + atc_level = info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] + is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] + + name = str() + synonyms = list() + names = info.get(NAMES_KEY, dict()) + if "RXN_PT" in names: + rxn_pt = names.get('RXN_PT', dict()) + if 'Y' in rxn_pt: + name = rxn_pt.get('Y', '') + assert len(name) == 1 + name = name[0] + else: + name = rxn_pt.get('N', '') + assert len(name) == 1 + name = name[0] + synonyms = [syn for syn in names.get('PT', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('PT', dict()).get('N', list())] + synonyms += [syn for syn in names.get('IN', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('IN', dict()).get('N', list())] + elif "PT" in names: + pt = names.get('PT', dict()) + if 'Y' in pt: + name = pt.get('Y', '') + 
assert len(name) == 1 + name = name[0] + else: + name = pt.get('N', '') + assert len(name) == 1 + name = name[0] + synonyms += [syn for syn in names.get('IN', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('IN', dict()).get('N', list())] + else: + in_dict = names.get('IN', dict()) + if 'Y' in in_dict: + name = in_dict.get('Y', '') + assert len(name) == 1 + name = name[0] + else: + name = in_dict.get('N', '') + assert len(name) == 1 + name = name[0] + node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) + node['synonym'] = synonyms + description = str() + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description.strip("; ") + node['description'] = description + + nodes_output.write(node) + + +def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + curie_prefix = "CHV" # This should be replaced with a kg2_util prefix at some point + provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) + iri = iri_mappings[curie_prefix] + node_id + node_curie = make_node_id(curie_prefix, node_id) + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + + # Currently not used, but extracting them in case we want them in the future + combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) + combo_score_no_top_words = info.get(INFO_KEY, dict()).get('COMBO_SCORE_NO_TOP_WORDS', list()) + context_score = info.get(INFO_KEY, dict()).get('CONTEXT_SCORE', list()) + cui_score = info.get(INFO_KEY, dict()).get('CUI_SCORE', list()) + disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) + frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) + + name = str() + synonyms = list() + names = info.get(NAMES_KEY, dict()) + + print(curie_prefix + ":", names) + def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): curie_prefix = kg2_util.CURIE_PREFIX_DRUGBANK provided_by = 
make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) @@ -57,8 +141,11 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu node_curie = make_node_id(curie_prefix, node_id) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) + + # Currently not used, but extracting them in case we want them in the future fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) + name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('N', list()) if len(name) == 0: name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('Y', list()) @@ -119,6 +206,12 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu continue # Process the data specifically by source + if source == 'ATC': + process_atc_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + + if source == 'CHV': + process_chv_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + if source == 'DRUGBANK': process_drugbank_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) From 474348fd43ef7f9b05976ed0ab7293530a27235d Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 17:07:47 -0700 Subject: [PATCH 023/117] #316 #344 #280 CHV curies to urls --- curies-to-urls-map.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/curies-to-urls-map.yaml b/curies-to-urls-map.yaml index 8dcec53b..c5e8db0a 100644 --- a/curies-to-urls-map.yaml +++ b/curies-to-urls-map.yaml @@ -59,6 +59,8 @@ use_for_bidirectional_mapping: CHEMBL.TARGET: "https://identifiers.org/chembl.target:" - CHMO: http://purl.obolibrary.org/obo/CHMO_ + - + CHV: http://purl.bioontology.org/ontology/CHV/ - CID: 'http://pubchem.ncbi.nlm.nih.gov/compound/' - From d1b17a4d551664657ba2d89f676b48911195d3ac Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 17:16:54 -0700 Subject: [PATCH 024/117] miRBase URL issue --- extract-mirbase.sh | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract-mirbase.sh b/extract-mirbase.sh index 8dee866e..c4ff3f00 100755 --- a/extract-mirbase.sh +++ b/extract-mirbase.sh @@ -23,7 +23,7 @@ output_file=${1:-"${BUILD_DIR}/miRNA.dat"} mkdir -p ${output_dir} -${curl_get} https://mirbase.org/download/miRNA.dat/ > /tmp/miRNA.dat +${curl_get} https://mirbase.org/download/miRNA.dat > /tmp/miRNA.dat ${curl_get} https://mirbase.org/download/README/ > ${output_dir}/miRBase_README.txt sed -i "s/
//" ${output_dir}/miRBase_README.txt From e84d67323926c60d2f840542595896bd54aa54c1 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 17:53:07 -0700 Subject: [PATCH 025/117] #316 TUI Category mappings --- tui_combo_mappings.json | 809 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 809 insertions(+) create mode 100644 tui_combo_mappings.json diff --git a/tui_combo_mappings.json b/tui_combo_mappings.json new file mode 100644 index 00000000..d30a088a --- /dev/null +++ b/tui_combo_mappings.json @@ -0,0 +1,809 @@ +{ + "('T001',)": "individual organism", + "('T002', 'T004')": "organism taxon", + "('T002', 'T025')": "cell", + "('T002', 'T033')": "disease or phenotypic feature", + "('T002', 'T037', 'T047')": "pathological process", + "('T002', 'T047')": "disease", + "('T002', 'T048', 'T167')": "disease", + "('T002', 'T059')": "procedure", + "('T002', 'T061')": "procedure", + "('T002', 'T061', 'T109', 'T121', 'T168')": "drug", + "('T002', 'T090')": "individual organism", + "('T002', 'T109')": "chemical entity", + "('T002', 'T109', 'T116', 'T121')": "drug", + "('T002', 'T109', 'T121')": "drug", + "('T002', 'T109', 'T121', 'T131')": "drug", + "('T002', 'T109', 'T121', 'T167')": "drug", + "('T002', 'T109', 'T121', 'T168')": "drug", + "('T002', 'T109', 'T130')": "chemical entity", + "('T002', 'T109', 'T168')": "food", + "('T002', 'T116', 'T121')": "drug", + "('T002', 'T116', 'T126')": "protein", + "('T002', 'T116', 'T129', 'T168')": "food", + "('T002', 'T121')": "drug", + "('T002', 'T121', 'T129')": "drug", + "('T002', 'T121', 'T129', 'T130')": "drug", + "('T002', 'T129', 'T168')": "food", + "('T002', 'T167')": "chemical entity", + "('T002', 'T168')": "food", + "('T002', 'T204')": "organism taxon", + "('T002',)": "organism taxon", + "('T004', 'T017')": "anatomical entity", + "('T004', 'T060')": "procedure", + "('T004', 'T109', 'T121')": "drug", + "('T004', 'T121', 'T129', 'T168')": "drug", + "('T004', 'T200')": "drug", + "('T004', 'T204')": "organism taxon", 
+ "('T004',)": "organism taxon", + "('T005', 'T023', 'T026')": "cellular component", + "('T005', 'T028')": "organism taxon", + "('T005', 'T042')": "physiological process", + "('T005', 'T047')": "disease", + "('T005', 'T059', 'T067')": "procedure", + "('T005', 'T081')": "organism taxon", + "('T005', 'T114')": "nucleic acid entity", + "('T005', 'T116')": "polypeptide", + "('T005', 'T116', 'T121', 'T129')": "drug", + "('T005', 'T116', 'T123')": "polypeptide", + "('T005', 'T121')": "drug", + "('T005', 'T200')": "drug", + "('T005',)": "organism taxon", + "('T007', 'T032', 'T201')": "organism taxon", + "('T007', 'T037')": "pathological process", + "('T007', 'T047')": "disease", + "('T007', 'T058')": "activity", + "('T007', 'T059')": "procedure", + "('T007', 'T070')": "phenomenon", + "('T007', 'T074')": "device", + "('T007', 'T109', 'T121')": "drug", + "('T007', 'T109', 'T121', 'T129')": "drug", + "('T007', 'T109', 'T123')": "chemical entity", + "('T007', 'T116', 'T121', 'T129')": "drug", + "('T007', 'T121')": "drug", + "('T007', 'T121', 'T129')": "drug", + "('T007', 'T122')": "device", + "('T007', 'T168')": "food", + "('T007', 'T185')": "organism taxon", + "('T007', 'T200')": "drug", + "('T007', 'T203')": "device", + "('T007', 'T204')": "organism taxon", + "('T007',)": "organism taxon", + "('T008',)": "organism taxon", + "('T010',)": "organism taxon", + "('T011',)": "organism taxon", + "('T012',)": "organism taxon", + "('T013',)": "organism taxon", + "('T014',)": "organism taxon", + "('T015',)": "organism taxon", + "('T016',)": "organism taxon", + "('T017',)": "anatomical entity", + "('T018',)": "gross anatomical structure", + "('T019', 'T028')": "disease", + "('T019', 'T028', 'T047')": "disease", + "('T019', 'T033')": "disease", + "('T019', 'T047')": "disease", + "('T019',)": "disease", + "('T020',)": "disease", + "('T021',)": "gross anatomical structure", + "('T022',)": "anatomical entity", + "('T023', 'T024')": "gross anatomical structure", + "('T023', 'T025')": 
"cell", + "('T023', 'T026')": "cellular component", + "('T023', 'T029')": "anatomical entity", + "('T023', 'T030')": "anatomical entity", + "('T023', 'T033')": "disease or phenotypic feature", + "('T023', 'T033', 'T047')": "disease", + "('T023', 'T047')": "disease", + "('T023', 'T061')": "procedure", + "('T023', 'T191')": "disease", + "('T023',)": "gross anatomical structure", + "('T024', 'T026')": "cellular component", + "('T024', 'T031')": "gross anatomical structure", + "('T024', 'T033')": "disease or phenotypic feature", + "('T024', 'T040')": "physiological process", + "('T024', 'T109', 'T123')": "chemical entity", + "('T024', 'T116', 'T123')": "polypeptide", + "('T024', 'T116', 'T129')": "polypeptide", + "('T024', 'T121')": "drug", + "('T024', 'T200')": "drug", + "('T024', 'T201')": "gross anatomical structure", + "('T024',)": "gross anatomical structure", + "('T025', 'T026')": "cell", + "('T025', 'T029')": "anatomical entity", + "('T025', 'T031', 'T061')": "cell", + "('T025', 'T031', 'T185')": "anatomical entity", + "('T025', 'T032')": "cell", + "('T025', 'T033')": "disease or phenotypic feature", + "('T025', 'T033', 'T047')": "disease", + "('T025', 'T034')": "phenomenon", + "('T025', 'T037')": "pathological process", + "('T025', 'T038')": "phenomenon", + "('T025', 'T049')": "disease", + "('T025', 'T059')": "procedure", + "('T025', 'T063', 'T170')": "procedure", + "('T025', 'T081')": "cell", + "('T025', 'T109', 'T121')": "drug", + "('T025', 'T114', 'T121')": "drug", + "('T025', 'T116', 'T121', 'T129')": "drug", + "('T025', 'T121')": "drug", + "('T025', 'T121', 'T129')": "drug", + "('T025', 'T122')": "device", + "('T025', 'T129')": "cell", + "('T025', 'T170')": "publication", + "('T025', 'T191')": "disease", + "('T025', 'T200')": "drug", + "('T025',)": "cell", + "('T026',)": "cellular component", + "('T028', 'T033')": "disease or phenotypic feature", + "('T028', 'T033', 'T047')": "disease", + "('T028', 'T033', 'T047', 'T191')": "disease", + "('T028', 'T047')": 
"disease", + "('T028', 'T048')": "disease", + "('T028',)": "named thing", + "('T029',)": "anatomical entity", + "('T030',)": "anatomical entity", + "('T031', 'T121')": "drug", + "('T031', 'T121', 'T200')": "drug", + "('T031',)": "anatomical entity", + "('T032',)": "named thing", + "('T033', 'T034')": "phenomenon", + "('T033', 'T034', 'T047')": "disease", + "('T033', 'T034', 'T059')": "phenomenon", + "('T033', 'T037')": "pathological process", + "('T033', 'T037', 'T047')": "disease", + "('T033', 'T037', 'T055')": "pathological process", + "('T033', 'T037', 'T070', 'T167', 'T191')": "disease", + "('T033', 'T039')": "physiological process", + "('T033', 'T040')": "physiological process", + "('T033', 'T040', 'T046', 'T047')": "disease", + "('T033', 'T040', 'T047')": "disease", + "('T033', 'T041')": "behavior", + "('T033', 'T042')": "physiological process", + "('T033', 'T042', 'T047')": "disease", + "('T033', 'T046')": "pathological process", + "('T033', 'T046', 'T047')": "disease", + "('T033', 'T046', 'T047', 'T184')": "disease or phenotypic feature", + "('T033', 'T046', 'T061', 'T081', 'T093')": "pathological process", + "('T033', 'T046', 'T184')": "disease or phenotypic feature", + "('T033', 'T047')": "disease", + "('T033', 'T047', 'T048', 'T054', 'T102')": "disease", + "('T033', 'T047', 'T048', 'T184')": "disease or phenotypic feature", + "('T033', 'T047', 'T059', 'T074')": "disease", + "('T033', 'T047', 'T170')": "disease", + "('T033', 'T047', 'T184')": "disease or phenotypic feature", + "('T033', 'T047', 'T190')": "disease", + "('T033', 'T047', 'T191')": "disease", + "('T033', 'T048')": "disease", + "('T033', 'T048', 'T054')": "disease", + "('T033', 'T048', 'T169')": "disease", + "('T033', 'T049')": "disease", + "('T033', 'T051')": "event", + "('T033', 'T052', 'T061')": "procedure", + "('T033', 'T054')": "behavior", + "('T033', 'T054', 'T080')": "behavior", + "('T033', 'T055')": "behavior", + "('T033', 'T055', 'T061')": "procedure", + "('T033', 'T055', 'T185')": 
"behavior", + "('T033', 'T056', 'T078', 'T080', 'T169', 'T170')": "publication", + "('T033', 'T057', 'T080')": "activity", + "('T033', 'T058')": "activity", + "('T033', 'T059')": "procedure", + "('T033', 'T060')": "procedure", + "('T033', 'T060', 'T080')": "procedure", + "('T033', 'T061')": "procedure", + "('T033', 'T061', 'T168')": "procedure", + "('T033', 'T067')": "phenomenon", + "('T033', 'T069', 'T131')": "phenomenon", + "('T033', 'T074')": "device", + "('T033', 'T078')": "disease or phenotypic feature", + "('T033', 'T078', 'T079', 'T170')": "publication", + "('T033', 'T078', 'T089', 'T095', 'T170')": "publication", + "('T033', 'T078', 'T089', 'T170')": "publication", + "('T033', 'T078', 'T169', 'T170')": "publication", + "('T033', 'T078', 'T170')": "publication", + "('T033', 'T079')": "disease or phenotypic feature", + "('T033', 'T079', 'T080', 'T081', 'T169', 'T170')": "publication", + "('T033', 'T080')": "disease or phenotypic feature", + "('T033', 'T080', 'T082')": "disease or phenotypic feature", + "('T033', 'T080', 'T170')": "publication", + "('T033', 'T081')": "disease or phenotypic feature", + "('T033', 'T083', 'T093', 'T169', 'T170')": "publication", + "('T033', 'T089')": "disease or phenotypic feature", + "('T033', 'T098')": "population of individual organisms", + "('T033', 'T098', 'T116', 'T121', 'T129')": "drug", + "('T033', 'T098', 'T121', 'T129')": "drug", + "('T033', 'T099')": "cohort", + "('T033', 'T101')": "cohort", + "('T033', 'T102')": "disease or phenotypic feature", + "('T033', 'T109', 'T122')": "chemical entity", + "('T033', 'T109', 'T123')": "chemical entity", + "('T033', 'T116', 'T123')": "polypeptide", + "('T033', 'T116', 'T129')": "polypeptide", + "('T033', 'T121')": "drug", + "('T033', 'T122')": "device", + "('T033', 'T168')": "food", + "('T033', 'T168', 'T170')": "food", + "('T033', 'T169')": "disease or phenotypic feature", + "('T033', 'T170')": "publication", + "('T033', 'T184')": "phenotypic feature", + "('T033', 'T185')": 
"disease or phenotypic feature", + "('T033', 'T190')": "disease", + "('T033', 'T191')": "disease", + "('T033', 'T197')": "chemical entity", + "('T033', 'T201')": "disease or phenotypic feature", + "('T033',)": "disease or phenotypic feature", + "('T034', 'T046')": "pathological process", + "('T034', 'T047')": "phenomenon", + "('T034', 'T058', 'T060')": "procedure", + "('T034', 'T059')": "procedure", + "('T034', 'T060')": "procedure", + "('T034', 'T063')": "procedure", + "('T034', 'T073')": "phenomenon", + "('T034', 'T073', 'T093')": "phenomenon", + "('T034', 'T074')": "device", + "('T034', 'T078')": "phenomenon", + "('T034', 'T081', 'T116', 'T121', 'T123')": "drug", + "('T034', 'T116', 'T121', 'T123')": "drug", + "('T034', 'T116', 'T129')": "polypeptide", + "('T034', 'T121')": "drug", + "('T034', 'T123', 'T196')": "small molecule", + "('T034', 'T196')": "small molecule", + "('T034', 'T201')": "phenomenon", + "('T034',)": "phenomenon", + "('T037', 'T047')": "disease", + "('T037',)": "pathological process", + "('T038', 'T043')": "phenomenon", + "('T038',)": "phenomenon", + "('T039', 'T040')": "physiological process", + "('T039', 'T121')": "drug", + "('T039',)": "physiological process", + "('T040', 'T042')": "physiological process", + "('T040', 'T043')": "physiological process", + "('T040', 'T044')": "physiological process", + "('T040',)": "physiological process", + "('T041',)": "behavior", + "('T042',)": "physiological process", + "('T043', 'T044')": "physiological process", + "('T043', 'T045')": "physiological process", + "('T043',)": "physiological process", + "('T044', 'T045')": "physiological process", + "('T044',)": "molecular activity", + "('T045',)": "physiological process", + "('T046', 'T047')": "disease", + "('T046',)": "pathological process", + "('T047', 'T184')": "disease or phenotypic feature", + "('T047', 'T190')": "disease", + "('T047', 'T191')": "disease", + "('T047',)": "disease", + "('T048',)": "disease", + "('T049',)": "disease", + "('T050', 
'T191')": "disease", + "('T050',)": "biological entity", + "('T051',)": "event", + "('T052',)": "activity", + "('T053',)": "behavior", + "('T054',)": "behavior", + "('T055',)": "behavior", + "('T056',)": "activity", + "('T057',)": "activity", + "('T058',)": "activity", + "('T059',)": "procedure", + "('T060',)": "procedure", + "('T061',)": "procedure", + "('T062',)": "activity", + "('T063',)": "procedure", + "('T064',)": "activity", + "('T065',)": "activity", + "('T066',)": "activity", + "('T067',)": "phenomenon", + "('T068',)": "phenomenon", + "('T069',)": "phenomenon", + "('T070',)": "phenomenon", + "('T071',)": "named thing", + "('T072',)": "physical entity", + "('T073', 'T092')": "agent", + "('T073', 'T093')": "agent", + "('T073', 'T170')": "publication", + "('T073',)": "physical entity", + "('T074', 'T109', 'T121')": "drug", + "('T074', 'T200')": "drug", + "('T074',)": "device", + "('T075',)": "device", + "('T077',)": "information content entity", + "('T078',)": "information content entity", + "('T079', 'T080')": "named thing", + "('T079', 'T080', 'T083')": "geographic location", + "('T079', 'T080', 'T170')": "publication", + "('T079', 'T081')": "named thing", + "('T079', 'T081', 'T169', 'T170')": "publication", + "('T079', 'T082')": "named thing", + "('T079', 'T083')": "geographic location", + "('T079', 'T083', 'T098')": "population of individual organisms", + "('T079', 'T090')": "individual organism", + "('T079', 'T090', 'T170')": "publication", + "('T079', 'T093')": "agent", + "('T079', 'T098')": "population of individual organisms", + "('T079', 'T098', 'T100')": "cohort", + "('T079', 'T100')": "cohort", + "('T079', 'T101')": "cohort", + "('T079', 'T102')": "named thing", + "('T079', 'T169')": "named thing", + "('T079', 'T170')": "publication", + "('T079',)": "named thing", + "('T080',)": "information content entity", + "('T081',)": "information content entity", + "('T082',)": "information content entity", + "('T083',)": "geographic location", + "('T085',)": 
"biological entity", + "('T086',)": "nucleic acid entity", + "('T087',)": "polypeptide", + "('T088',)": "biological entity", + "('T089',)": "information content entity", + "('T090',)": "individual organism", + "('T091',)": "named thing", + "('T092',)": "agent", + "('T093',)": "agent", + "('T094',)": "agent", + "('T095',)": "agent", + "('T096',)": "agent", + "('T097',)": "cohort", + "('T098',)": "population of individual organisms", + "('T099',)": "cohort", + "('T100',)": "cohort", + "('T101',)": "cohort", + "('T102',)": "information content entity", + "('T103',)": "chemical entity", + "('T104', 'T109')": "chemical entity", + "('T104', 'T109', 'T116', 'T121', 'T123', 'T130')": "drug", + "('T104', 'T109', 'T121')": "drug", + "('T104', 'T109', 'T121', 'T123', 'T130')": "drug", + "('T104', 'T109', 'T121', 'T130')": "drug", + "('T104', 'T109', 'T121', 'T130', 'T131')": "drug", + "('T104', 'T109', 'T123')": "chemical entity", + "('T104', 'T109', 'T123', 'T130')": "chemical entity", + "('T104', 'T109', 'T130')": "chemical entity", + "('T104', 'T109', 'T130', 'T131')": "chemical entity", + "('T104', 'T114', 'T121', 'T123', 'T130')": "drug", + "('T104', 'T116')": "polypeptide", + "('T104', 'T121')": "drug", + "('T104', 'T122')": "chemical entity", + "('T104', 'T122', 'T197')": "chemical entity", + "('T104', 'T123')": "chemical entity", + "('T104', 'T130')": "chemical entity", + "('T104', 'T130', 'T197')": "chemical entity", + "('T104', 'T169')": "chemical entity", + "('T104', 'T197')": "chemical entity", + "('T104',)": "chemical entity", + "('T109', 'T114')": "nucleic acid entity", + "('T109', 'T114', 'T116')": "polypeptide", + "('T109', 'T114', 'T116', 'T121', 'T129', 'T200')": "drug", + "('T109', 'T114', 'T121')": "drug", + "('T109', 'T114', 'T121', 'T123')": "drug", + "('T109', 'T114', 'T121', 'T123', 'T127')": "drug", + "('T109', 'T114', 'T121', 'T127')": "drug", + "('T109', 'T114', 'T121', 'T129')": "drug", + "('T109', 'T114', 'T121', 'T130')": "drug", + "('T109', 
'T114', 'T121', 'T131')": "drug", + "('T109', 'T114', 'T121', 'T200')": "drug", + "('T109', 'T114', 'T123')": "nucleic acid entity", + "('T109', 'T114', 'T123', 'T130')": "nucleic acid entity", + "('T109', 'T114', 'T127')": "small molecule", + "('T109', 'T114', 'T129', 'T130')": "nucleic acid entity", + "('T109', 'T114', 'T130')": "nucleic acid entity", + "('T109', 'T114', 'T195')": "drug", + "('T109', 'T116')": "polypeptide", + "('T109', 'T116', 'T121')": "drug", + "('T109', 'T116', 'T121', 'T122')": "drug", + "('T109', 'T116', 'T121', 'T123')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T125')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T129')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T130')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T131')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T192')": "drug", + "('T109', 'T116', 'T121', 'T123', 'T200')": "drug", + "('T109', 'T116', 'T121', 'T125')": "drug", + "('T109', 'T116', 'T121', 'T125', 'T130')": "drug", + "('T109', 'T116', 'T121', 'T126')": "drug", + "('T109', 'T116', 'T121', 'T126', 'T168')": "drug", + "('T109', 'T116', 'T121', 'T127')": "drug", + "('T109', 'T116', 'T121', 'T127', 'T197')": "drug", + "('T109', 'T116', 'T121', 'T129')": "drug", + "('T109', 'T116', 'T121', 'T129', 'T130')": "drug", + "('T109', 'T116', 'T121', 'T129', 'T130', 'T192')": "drug", + "('T109', 'T116', 'T121', 'T129', 'T131')": "drug", + "('T109', 'T116', 'T121', 'T129', 'T192')": "drug", + "('T109', 'T116', 'T121', 'T130')": "drug", + "('T109', 'T116', 'T121', 'T131')": "drug", + "('T109', 'T116', 'T121', 'T192')": "drug", + "('T109', 'T116', 'T121', 'T195')": "drug", + "('T109', 'T116', 'T122')": "device", + "('T109', 'T116', 'T123')": "polypeptide", + "('T109', 'T116', 'T123', 'T129')": "polypeptide", + "('T109', 'T116', 'T123', 'T130')": "polypeptide", + "('T109', 'T116', 'T123', 'T131')": "polypeptide", + "('T109', 'T116', 'T123', 'T192')": "protein", + "('T109', 'T116', 'T123', 'T195')": "drug", + "('T109', 'T116', 
'T126')": "protein", + "('T109', 'T116', 'T129')": "polypeptide", + "('T109', 'T116', 'T129', 'T130')": "polypeptide", + "('T109', 'T116', 'T129', 'T185')": "polypeptide", + "('T109', 'T116', 'T130')": "polypeptide", + "('T109', 'T116', 'T131')": "polypeptide", + "('T109', 'T116', 'T195')": "drug", + "('T109', 'T120')": "chemical entity", + "('T109', 'T120', 'T121')": "drug", + "('T109', 'T120', 'T121', 'T130')": "drug", + "('T109', 'T120', 'T121', 'T168')": "drug", + "('T109', 'T120', 'T130')": "chemical entity", + "('T109', 'T120', 'T130', 'T131')": "chemical entity", + "('T109', 'T120', 'T200')": "drug", + "('T109', 'T121')": "drug", + "('T109', 'T121', 'T122')": "drug", + "('T109', 'T121', 'T122', 'T123')": "drug", + "('T109', 'T121', 'T122', 'T130')": "drug", + "('T109', 'T121', 'T122', 'T131')": "drug", + "('T109', 'T121', 'T122', 'T197', 'T200')": "drug", + "('T109', 'T121', 'T122', 'T200')": "drug", + "('T109', 'T121', 'T123')": "drug", + "('T109', 'T121', 'T123', 'T125')": "drug", + "('T109', 'T121', 'T123', 'T127')": "drug", + "('T109', 'T121', 'T123', 'T130')": "drug", + "('T109', 'T121', 'T123', 'T130', 'T131')": "drug", + "('T109', 'T121', 'T123', 'T131')": "drug", + "('T109', 'T121', 'T123', 'T168')": "drug", + "('T109', 'T121', 'T123', 'T195')": "drug", + "('T109', 'T121', 'T123', 'T196')": "drug", + "('T109', 'T121', 'T123', 'T197')": "drug", + "('T109', 'T121', 'T123', 'T200')": "drug", + "('T109', 'T121', 'T125')": "drug", + "('T109', 'T121', 'T125', 'T127')": "drug", + "('T109', 'T121', 'T125', 'T130')": "drug", + "('T109', 'T121', 'T125', 'T131')": "drug", + "('T109', 'T121', 'T125', 'T196')": "drug", + "('T109', 'T121', 'T125', 'T200')": "drug", + "('T109', 'T121', 'T126')": "drug", + "('T109', 'T121', 'T127')": "drug", + "('T109', 'T121', 'T127', 'T130')": "drug", + "('T109', 'T121', 'T127', 'T200')": "drug", + "('T109', 'T121', 'T129')": "drug", + "('T109', 'T121', 'T129', 'T130')": "drug", + "('T109', 'T121', 'T129', 'T130', 'T131')": 
"drug", + "('T109', 'T121', 'T129', 'T131')": "drug", + "('T109', 'T121', 'T129', 'T168')": "drug", + "('T109', 'T121', 'T129', 'T192')": "drug", + "('T109', 'T121', 'T129', 'T200')": "drug", + "('T109', 'T121', 'T130')": "drug", + "('T109', 'T121', 'T130', 'T131')": "drug", + "('T109', 'T121', 'T130', 'T195')": "drug", + "('T109', 'T121', 'T130', 'T196')": "drug", + "('T109', 'T121', 'T130', 'T196', 'T197')": "drug", + "('T109', 'T121', 'T130', 'T197')": "drug", + "('T109', 'T121', 'T130', 'T200')": "drug", + "('T109', 'T121', 'T131')": "drug", + "('T109', 'T121', 'T131', 'T167')": "drug", + "('T109', 'T121', 'T131', 'T197')": "drug", + "('T109', 'T121', 'T131', 'T200')": "drug", + "('T109', 'T121', 'T131', 'T204')": "drug", + "('T109', 'T121', 'T167')": "drug", + "('T109', 'T121', 'T168')": "drug", + "('T109', 'T121', 'T168', 'T197')": "drug", + "('T109', 'T121', 'T168', 'T200')": "drug", + "('T109', 'T121', 'T170')": "drug", + "('T109', 'T121', 'T195')": "drug", + "('T109', 'T121', 'T195', 'T200')": "drug", + "('T109', 'T121', 'T196')": "drug", + "('T109', 'T121', 'T196', 'T197')": "drug", + "('T109', 'T121', 'T197')": "drug", + "('T109', 'T121', 'T200')": "drug", + "('T109', 'T121', 'T201')": "drug", + "('T109', 'T121', 'T203')": "drug", + "('T109', 'T121', 'T204')": "drug", + "('T109', 'T122')": "device", + "('T109', 'T122', 'T123')": "device", + "('T109', 'T122', 'T130')": "device", + "('T109', 'T122', 'T131')": "device", + "('T109', 'T122', 'T167')": "device", + "('T109', 'T122', 'T200')": "drug", + "('T109', 'T123')": "chemical entity", + "('T109', 'T123', 'T125')": "chemical entity", + "('T109', 'T123', 'T129')": "chemical entity", + "('T109', 'T123', 'T130')": "chemical entity", + "('T109', 'T123', 'T130', 'T200')": "drug", + "('T109', 'T123', 'T131')": "chemical entity", + "('T109', 'T123', 'T168')": "food", + "('T109', 'T123', 'T192')": "protein", + "('T109', 'T123', 'T195')": "drug", + "('T109', 'T123', 'T200')": "drug", + "('T109', 'T123', 'T204')": 
"chemical entity", + "('T109', 'T125')": "chemical entity", + "('T109', 'T125', 'T130')": "chemical entity", + "('T109', 'T125', 'T200')": "drug", + "('T109', 'T127')": "small molecule", + "('T109', 'T127', 'T130')": "small molecule", + "('T109', 'T127', 'T195')": "drug", + "('T109', 'T129')": "chemical entity", + "('T109', 'T129', 'T130')": "chemical entity", + "('T109', 'T129', 'T131')": "chemical entity", + "('T109', 'T129', 'T185')": "chemical entity", + "('T109', 'T129', 'T192')": "protein", + "('T109', 'T129', 'T200')": "drug", + "('T109', 'T130')": "chemical entity", + "('T109', 'T130', 'T131')": "chemical entity", + "('T109', 'T130', 'T131', 'T196')": "small molecule", + "('T109', 'T130', 'T131', 'T197')": "chemical entity", + "('T109', 'T130', 'T131', 'T200')": "drug", + "('T109', 'T130', 'T167')": "chemical entity", + "('T109', 'T130', 'T195')": "drug", + "('T109', 'T130', 'T196')": "small molecule", + "('T109', 'T130', 'T197')": "chemical entity", + "('T109', 'T130', 'T200')": "drug", + "('T109', 'T131')": "chemical entity", + "('T109', 'T131', 'T195')": "drug", + "('T109', 'T131', 'T196')": "small molecule", + "('T109', 'T131', 'T197')": "chemical entity", + "('T109', 'T131', 'T200')": "drug", + "('T109', 'T167')": "chemical entity", + "('T109', 'T168')": "food", + "('T109', 'T168', 'T200')": "drug", + "('T109', 'T184')": "phenotypic feature", + "('T109', 'T195')": "drug", + "('T109', 'T195', 'T200')": "drug", + "('T109', 'T196')": "small molecule", + "('T109', 'T197')": "chemical entity", + "('T109', 'T200')": "drug", + "('T109', 'T203')": "device", + "('T109',)": "chemical entity", + "('T114', 'T116')": "polypeptide", + "('T114', 'T116', 'T121')": "drug", + "('T114', 'T116', 'T121', 'T129')": "drug", + "('T114', 'T116', 'T121', 'T200')": "drug", + "('T114', 'T116', 'T123')": "nucleic acid entity", + "('T114', 'T116', 'T123', 'T126')": "polypeptide", + "('T114', 'T116', 'T126')": "protein", + "('T114', 'T116', 'T129')": "polypeptide", + "('T114', 
'T116', 'T195')": "drug", + "('T114', 'T121')": "drug", + "('T114', 'T121', 'T123')": "drug", + "('T114', 'T121', 'T123', 'T130')": "drug", + "('T114', 'T121', 'T123', 'T200')": "drug", + "('T114', 'T121', 'T127')": "drug", + "('T114', 'T121', 'T129')": "drug", + "('T114', 'T121', 'T129', 'T200')": "drug", + "('T114', 'T121', 'T130')": "drug", + "('T114', 'T121', 'T131')": "drug", + "('T114', 'T121', 'T195')": "drug", + "('T114', 'T121', 'T200')": "drug", + "('T114', 'T123')": "nucleic acid entity", + "('T114', 'T123', 'T130')": "nucleic acid entity", + "('T114', 'T123', 'T131')": "nucleic acid entity", + "('T114', 'T123', 'T195')": "drug", + "('T114', 'T123', 'T200')": "drug", + "('T114', 'T126')": "protein", + "('T114', 'T127')": "small molecule", + "('T114', 'T129')": "nucleic acid entity", + "('T114', 'T130')": "nucleic acid entity", + "('T114', 'T131')": "nucleic acid entity", + "('T114', 'T195')": "drug", + "('T114',)": "nucleic acid entity", + "('T116', 'T121')": "drug", + "('T116', 'T121', 'T122')": "drug", + "('T116', 'T121', 'T122', 'T123')": "drug", + "('T116', 'T121', 'T122', 'T126')": "drug", + "('T116', 'T121', 'T123')": "drug", + "('T116', 'T121', 'T123', 'T125')": "drug", + "('T116', 'T121', 'T123', 'T126')": "drug", + "('T116', 'T121', 'T123', 'T129')": "drug", + "('T116', 'T121', 'T123', 'T129', 'T131')": "drug", + "('T116', 'T121', 'T123', 'T130')": "drug", + "('T116', 'T121', 'T123', 'T131')": "drug", + "('T116', 'T121', 'T123', 'T168')": "drug", + "('T116', 'T121', 'T123', 'T192')": "drug", + "('T116', 'T121', 'T123', 'T195')": "drug", + "('T116', 'T121', 'T123', 'T196')": "drug", + "('T116', 'T121', 'T123', 'T200')": "drug", + "('T116', 'T121', 'T125')": "drug", + "('T116', 'T121', 'T125', 'T129')": "drug", + "('T116', 'T121', 'T125', 'T130')": "drug", + "('T116', 'T121', 'T125', 'T200')": "drug", + "('T116', 'T121', 'T126')": "drug", + "('T116', 'T121', 'T126', 'T129')": "drug", + "('T116', 'T121', 'T126', 'T200')": "drug", + "('T116', 
'T121', 'T127')": "drug", + "('T116', 'T121', 'T129')": "drug", + "('T116', 'T121', 'T129', 'T130')": "drug", + "('T116', 'T121', 'T129', 'T131')": "drug", + "('T116', 'T121', 'T129', 'T167')": "drug", + "('T116', 'T121', 'T129', 'T168')": "drug", + "('T116', 'T121', 'T129', 'T192')": "drug", + "('T116', 'T121', 'T129', 'T197')": "drug", + "('T116', 'T121', 'T129', 'T200')": "drug", + "('T116', 'T121', 'T130')": "drug", + "('T116', 'T121', 'T131')": "drug", + "('T116', 'T121', 'T168')": "drug", + "('T116', 'T121', 'T192')": "drug", + "('T116', 'T121', 'T195')": "drug", + "('T116', 'T121', 'T195', 'T200')": "drug", + "('T116', 'T121', 'T200')": "drug", + "('T116', 'T121', 'T203')": "drug", + "('T116', 'T122')": "device", + "('T116', 'T122', 'T123')": "polypeptide", + "('T116', 'T123')": "polypeptide", + "('T116', 'T123', 'T125')": "polypeptide", + "('T116', 'T123', 'T126')": "protein", + "('T116', 'T123', 'T126', 'T129')": "protein", + "('T116', 'T123', 'T126', 'T131')": "protein", + "('T116', 'T123', 'T126', 'T192')": "protein", + "('T116', 'T123', 'T129')": "polypeptide", + "('T116', 'T123', 'T129', 'T130')": "polypeptide", + "('T116', 'T123', 'T129', 'T192')": "protein", + "('T116', 'T123', 'T130')": "polypeptide", + "('T116', 'T123', 'T131')": "polypeptide", + "('T116', 'T123', 'T184')": "polypeptide", + "('T116', 'T123', 'T192')": "protein", + "('T116', 'T123', 'T195')": "drug", + "('T116', 'T123', 'T200')": "drug", + "('T116', 'T125')": "polypeptide", + "('T116', 'T125', 'T130')": "polypeptide", + "('T116', 'T125', 'T200')": "drug", + "('T116', 'T126')": "protein", + "('T116', 'T126', 'T127')": "protein", + "('T116', 'T126', 'T129')": "protein", + "('T116', 'T126', 'T129', 'T131')": "protein", + "('T116', 'T126', 'T130')": "protein", + "('T116', 'T126', 'T131')": "protein", + "('T116', 'T126', 'T169')": "protein", + "('T116', 'T126', 'T184')": "protein", + "('T116', 'T126', 'T191')": "protein", + "('T116', 'T126', 'T192')": "protein", + "('T116', 'T126', 
'T200')": "drug", + "('T116', 'T127')": "polypeptide", + "('T116', 'T129')": "polypeptide", + "('T116', 'T129', 'T130')": "polypeptide", + "('T116', 'T129', 'T131')": "polypeptide", + "('T116', 'T129', 'T192')": "protein", + "('T116', 'T129', 'T195')": "drug", + "('T116', 'T129', 'T196')": "polypeptide", + "('T116', 'T129', 'T200')": "drug", + "('T116', 'T130')": "polypeptide", + "('T116', 'T130', 'T131')": "polypeptide", + "('T116', 'T130', 'T192')": "protein", + "('T116', 'T130', 'T195')": "drug", + "('T116', 'T130', 'T200')": "drug", + "('T116', 'T131')": "polypeptide", + "('T116', 'T131', 'T200')": "drug", + "('T116', 'T168')": "food", + "('T116', 'T168', 'T195')": "drug", + "('T116', 'T192')": "protein", + "('T116', 'T195')": "drug", + "('T116', 'T195', 'T200')": "drug", + "('T116', 'T200')": "drug", + "('T116',)": "polypeptide", + "('T120',)": "chemical entity", + "('T121', 'T122')": "drug", + "('T121', 'T122', 'T127')": "drug", + "('T121', 'T122', 'T130', 'T196', 'T197')": "drug", + "('T121', 'T122', 'T197')": "drug", + "('T121', 'T123')": "drug", + "('T121', 'T123', 'T125')": "drug", + "('T121', 'T123', 'T129')": "drug", + "('T121', 'T123', 'T130', 'T197')": "drug", + "('T121', 'T123', 'T131')": "drug", + "('T121', 'T123', 'T168', 'T196')": "drug", + "('T121', 'T123', 'T196')": "drug", + "('T121', 'T123', 'T196', 'T197')": "drug", + "('T121', 'T123', 'T196', 'T200')": "drug", + "('T121', 'T123', 'T197')": "drug", + "('T121', 'T123', 'T200')": "drug", + "('T121', 'T125')": "drug", + "('T121', 'T125', 'T127')": "drug", + "('T121', 'T126')": "drug", + "('T121', 'T127')": "drug", + "('T121', 'T127', 'T130')": "drug", + "('T121', 'T127', 'T167')": "drug", + "('T121', 'T127', 'T200')": "drug", + "('T121', 'T129')": "drug", + "('T121', 'T129', 'T130')": "drug", + "('T121', 'T129', 'T130', 'T200')": "drug", + "('T121', 'T129', 'T131')": "drug", + "('T121', 'T129', 'T168')": "drug", + "('T121', 'T129', 'T200')": "drug", + "('T121', 'T130')": "drug", + "('T121', 
'T130', 'T131', 'T196', 'T197')": "drug", + "('T121', 'T130', 'T131', 'T197')": "drug", + "('T121', 'T130', 'T196')": "drug", + "('T121', 'T130', 'T196', 'T197')": "drug", + "('T121', 'T130', 'T196', 'T200')": "drug", + "('T121', 'T130', 'T197')": "drug", + "('T121', 'T130', 'T197', 'T200')": "drug", + "('T121', 'T130', 'T200')": "drug", + "('T121', 'T131')": "drug", + "('T121', 'T131', 'T196')": "drug", + "('T121', 'T131', 'T197')": "drug", + "('T121', 'T167', 'T197')": "drug", + "('T121', 'T168')": "drug", + "('T121', 'T168', 'T196')": "drug", + "('T121', 'T168', 'T197')": "drug", + "('T121', 'T169')": "drug", + "('T121', 'T170')": "drug", + "('T121', 'T195')": "drug", + "('T121', 'T196')": "drug", + "('T121', 'T196', 'T197')": "drug", + "('T121', 'T196', 'T200')": "drug", + "('T121', 'T197')": "drug", + "('T121', 'T197', 'T200')": "drug", + "('T121', 'T197', 'T203')": "drug", + "('T121', 'T200')": "drug", + "('T121', 'T203')": "drug", + "('T121', 'T204')": "drug", + "('T121',)": "drug", + "('T122', 'T123')": "chemical entity", + "('T122', 'T130')": "chemical entity", + "('T122', 'T167')": "chemical entity", + "('T122', 'T169')": "device", + "('T122', 'T170')": "publication", + "('T122', 'T197')": "chemical entity", + "('T122', 'T200')": "drug", + "('T122',)": "device", + "('T123', 'T129')": "chemical entity", + "('T123', 'T129', 'T131')": "chemical entity", + "('T123', 'T130')": "chemical entity", + "('T123', 'T130', 'T196')": "small molecule", + "('T123', 'T130', 'T197')": "chemical entity", + "('T123', 'T131')": "chemical entity", + "('T123', 'T131', 'T200')": "drug", + "('T123', 'T168', 'T196')": "small molecule", + "('T123', 'T195')": "drug", + "('T123', 'T196')": "small molecule", + "('T123', 'T196', 'T197')": "small molecule", + "('T123', 'T196', 'T200')": "drug", + "('T123', 'T197')": "chemical entity", + "('T123', 'T197', 'T200')": "drug", + "('T123', 'T200')": "drug", + "('T123',)": "chemical entity", + "('T125',)": "chemical entity", + "('T126',)": 
"protein", + "('T127',)": "small molecule", + "('T129', 'T130')": "chemical entity", + "('T129', 'T131')": "chemical entity", + "('T129', 'T167')": "chemical entity", + "('T129', 'T168')": "food", + "('T129', 'T185')": "named thing", + "('T129', 'T192')": "protein", + "('T129', 'T200')": "drug", + "('T129',)": "biological entity", + "('T130', 'T131')": "chemical entity", + "('T130', 'T131', 'T196')": "small molecule", + "('T130', 'T131', 'T196', 'T197')": "small molecule", + "('T130', 'T131', 'T197')": "chemical entity", + "('T130', 'T167')": "chemical entity", + "('T130', 'T195')": "drug", + "('T130', 'T196')": "small molecule", + "('T130', 'T196', 'T197')": "small molecule", + "('T130', 'T197')": "chemical entity", + "('T130', 'T197', 'T200')": "drug", + "('T130', 'T200')": "drug", + "('T130',)": "chemical entity", + "('T131', 'T196')": "small molecule", + "('T131', 'T197')": "chemical entity", + "('T131',)": "chemical entity", + "('T167',)": "chemical entity", + "('T168',)": "food", + "('T169',)": "information content entity", + "('T170',)": "publication", + "('T171',)": "information content entity", + "('T184',)": "phenotypic feature", + "('T185',)": "information content entity", + "('T190',)": "disease", + "('T191',)": "disease", + "('T192',)": "protein", + "('T194',)": "organism taxon", + "('T195',)": "drug", + "('T196',)": "small molecule", + "('T197',)": "chemical entity", + "('T200',)": "drug", + "('T201',)": "named thing", + "('T203',)": "device", + "('T204',)": "organism taxon", + "()": "named thing" +} From 3702a77111c5bbf8dfc052d2b1912fc8bc18a845 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 16 Aug 2023 17:53:14 -0700 Subject: [PATCH 026/117] #316 more CHV code --- umls_list_jsonl_to_kg_jsonl.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index d372084d..711021cd 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py 
@@ -131,8 +131,27 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed name = str() synonyms = list() names = info.get(NAMES_KEY, dict()) + pt = names.get('PT', dict()) + if 'Y' in pt: + name = pt.get('Y', '') + assert len(name) == 1, str(name) + ' ' + node_curie + name = name[0] + else: + name = pt.get('N', '') + assert len(name) == 1, str(name) + ' ' + node_curie + name = name[0] + synonyms += [syn for syn in names.get('SY', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] + + node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) + node['synonym'] = synonyms + description = str() + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description.strip("; ") + node['description'] = description - print(curie_prefix + ":", names) + nodes_output.write(node) def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): curie_prefix = kg2_util.CURIE_PREFIX_DRUGBANK From 83f85916c327154bd338ab4f097e57641c3e315e Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 17 Aug 2023 11:59:20 -0700 Subject: [PATCH 027/117] #316 CHV is working now --- tui_combo_mappings.json | 519 ++++++++++++++++++++++++++++++++- umls_list_jsonl_to_kg_jsonl.py | 10 +- 2 files changed, 512 insertions(+), 17 deletions(-) diff --git a/tui_combo_mappings.json b/tui_combo_mappings.json index d30a088a..ea707d0a 100644 --- a/tui_combo_mappings.json +++ b/tui_combo_mappings.json @@ -1,4 +1,11 @@ { + "('T001', 'T002', 'T008')": "organism taxon", + "('T001', 'T007')": "individual organism", + "('T001', 'T028', 'T032')": "individual organism", + "('T001', 'T058')": "activity", + "('T001', 'T077')": "individual organism", + "('T001', 'T102')": "individual organism", + "('T001', 'T204')": "individual organism", "('T001',)": "individual organism", "('T002', 'T004')": "organism taxon", "('T002', 'T025')": "cell", @@ -69,34 +76,113 @@ 
"('T007', 'T203')": "device", "('T007', 'T204')": "organism taxon", "('T007',)": "organism taxon", + "('T008', 'T015')": "organism taxon", + "('T008', 'T040')": "organism taxon", + "('T008', 'T061', 'T083', 'T170')": "publication", + "('T008', 'T083', 'T170')": "publication", + "('T008', 'T170')": "organism taxon", + "('T008', 'T204')": "organism taxon", "('T008',)": "organism taxon", + "('T010', 'T061')": "procedure", "('T010',)": "organism taxon", + "('T011', 'T204')": "organism taxon", "('T011',)": "organism taxon", + "('T012', 'T047')": "disease", + "('T012', 'T059')": "procedure", "('T012',)": "organism taxon", + "('T013', 'T047')": "disease", + "('T013', 'T116', 'T126')": "protein", + "('T013', 'T168')": "food", "('T013',)": "organism taxon", "('T014',)": "organism taxon", + "('T015', 'T109', 'T121', 'T131')": "drug", "('T015',)": "organism taxon", + "('T016', 'T023')": "gross anatomical structure", + "('T016', 'T054')": "behavior", "('T016',)": "organism taxon", + "('T017', 'T023')": "anatomical entity", + "('T017', 'T026')": "cellular component", + "('T017', 'T061')": "procedure", + "('T017', 'T091')": "anatomical entity", "('T017',)": "anatomical entity", "('T018',)": "gross anatomical structure", + "('T019', 'T020')": "disease", + "('T019', 'T023')": "disease", "('T019', 'T028')": "disease", + "('T019', 'T028', 'T033')": "disease", + "('T019', 'T028', 'T033', 'T047')": "disease", + "('T019', 'T028', 'T033', 'T047', 'T191')": "disease", "('T019', 'T028', 'T047')": "disease", "('T019', 'T033')": "disease", + "('T019', 'T033', 'T047')": "disease", + "('T019', 'T033', 'T190')": "disease", + "('T019', 'T046')": "disease", + "('T019', 'T046', 'T047')": "disease", + "('T019', 'T046', 'T080', 'T169')": "disease", "('T019', 'T047')": "disease", + "('T019', 'T047', 'T190')": "disease", + "('T019', 'T047', 'T191')": "disease", + "('T019', 'T073')": "disease", + "('T019', 'T116', 'T121', 'T123')": "named thing", + "('T019', 'T121', 'T123', 'T196')": "named thing", + 
"('T019', 'T190')": "disease", + "('T019', 'T191')": "disease", "('T019',)": "disease", + "('T020', 'T026')": "disease", + "('T020', 'T030', 'T047')": "disease", + "('T020', 'T031', 'T046')": "disease", + "('T020', 'T033', 'T046')": "disease", + "('T020', 'T033', 'T047')": "disease", + "('T020', 'T037', 'T072')": "disease", + "('T020', 'T046')": "disease", + "('T020', 'T046', 'T190')": "disease", + "('T020', 'T047')": "disease", + "('T020', 'T047', 'T190')": "disease", + "('T020', 'T059')": "disease", + "('T020', 'T081')": "disease", + "('T020', 'T121', 'T196')": "named thing", + "('T020', 'T190')": "disease", "('T020',)": "disease", "('T021',)": "gross anatomical structure", + "('T022', 'T023')": "gross anatomical structure", + "('T022', 'T033')": "anatomical entity", + "('T022', 'T116', 'T121')": "drug", "('T022',)": "anatomical entity", "('T023', 'T024')": "gross anatomical structure", + "('T023', 'T024', 'T025')": "cell", + "('T023', 'T024', 'T030')": "gross anatomical structure", "('T023', 'T025')": "cell", "('T023', 'T026')": "cellular component", "('T023', 'T029')": "anatomical entity", + "('T023', 'T029', 'T030')": "gross anatomical structure", + "('T023', 'T029', 'T070')": "gross anatomical structure", "('T023', 'T030')": "anatomical entity", + "('T023', 'T030', 'T047', 'T048')": "disease", + "('T023', 'T031')": "gross anatomical structure", "('T023', 'T033')": "disease or phenotypic feature", "('T023', 'T033', 'T047')": "disease", + "('T023', 'T034')": "phenomenon", + "('T023', 'T037')": "pathological process", + "('T023', 'T037', 'T047', 'T190')": "disease", + "('T023', 'T042')": "gross anatomical structure", + "('T023', 'T046', 'T047', 'T190')": "disease", "('T023', 'T047')": "disease", + "('T023', 'T047', 'T061')": "disease", + "('T023', 'T047', 'T184')": "disease or phenotypic feature", + "('T023', 'T047', 'T191')": "disease", "('T023', 'T061')": "procedure", + "('T023', 'T061', 'T080', 'T081')": "named thing", + "('T023', 'T073', 'T093')": "gross 
anatomical structure", + "('T023', 'T074')": "device", + "('T023', 'T078')": "gross anatomical structure", + "('T023', 'T078', 'T097')": "gross anatomical structure", + "('T023', 'T078', 'T170')": "named thing", + "('T023', 'T080')": "gross anatomical structure", + "('T023', 'T080', 'T081')": "gross anatomical structure", + "('T023', 'T081', 'T083')": "gross anatomical structure", + "('T023', 'T170')": "publication", "('T023', 'T191')": "disease", + "('T023', 'T200')": "drug", "('T023',)": "gross anatomical structure", "('T024', 'T026')": "cellular component", "('T024', 'T031')": "gross anatomical structure", @@ -134,27 +220,90 @@ "('T025', 'T191')": "disease", "('T025', 'T200')": "drug", "('T025',)": "cell", + "('T026', 'T028')": "cellular component", + "('T026', 'T030')": "cellular component", + "('T026', 'T031')": "cellular component", + "('T026', 'T033')": "cellular component", + "('T026', 'T038', 'T043')": "phenomenon", + "('T026', 'T043')": "physiological process", + "('T026', 'T043', 'T044')": "cellular component", + "('T026', 'T044')": "cellular component", + "('T026', 'T044', 'T045')": "physiological process", + "('T026', 'T045')": "physiological process", + "('T026', 'T047')": "disease", + "('T026', 'T073')": "cellular component", + "('T026', 'T114', 'T123')": "nucleic acid entity", + "('T026', 'T116', 'T123')": "polypeptide", + "('T026', 'T116', 'T123', 'T126')": "protein", + "('T026', 'T116', 'T126')": "protein", + "('T026', 'T116', 'T129')": "polypeptide", + "('T026', 'T167')": "cellular component", + "('T026', 'T191')": "disease", "('T026',)": "cellular component", "('T028', 'T033')": "disease or phenotypic feature", "('T028', 'T033', 'T047')": "disease", "('T028', 'T033', 'T047', 'T191')": "disease", + "('T028', 'T033', 'T191')": "disease", + "('T028', 'T045')": "physiological process", + "('T028', 'T046', 'T047')": "disease", "('T028', 'T047')": "disease", + "('T028', 'T047', 'T048')": "disease", + "('T028', 'T047', 'T191')": "disease", "('T028', 
'T048')": "disease", + "('T028', 'T054')": "behavior", + "('T028', 'T114')": "nucleic acid entity", + "('T028', 'T114', 'T123')": "nucleic acid entity", + "('T028', 'T116', 'T123')": "polypeptide", + "('T028', 'T191')": "disease", "('T028',)": "named thing", + "('T029', 'T061')": "procedure", + "('T029', 'T078', 'T170')": "publication", + "('T029', 'T081', 'T167', 'T170')": "named thing", + "('T029', 'T082')": "anatomical entity", + "('T029', 'T116', 'T129')": "polypeptide", "('T029',)": "anatomical entity", + "('T030', 'T033')": "anatomical entity", "('T030',)": "anatomical entity", + "('T031', 'T033')": "anatomical entity", + "('T031', 'T033', 'T046')": "pathological process", + "('T031', 'T033', 'T047')": "disease", + "('T031', 'T033', 'T073', 'T078', 'T079', 'T080', 'T170')": "disease or phenotypic feature", + "('T031', 'T033', 'T184')": "phenotypic feature", + "('T031', 'T037')": "anatomical entity", + "('T031', 'T039')": "physiological process", + "('T031', 'T040')": "physiological process", + "('T031', 'T046')": "pathological process", + "('T031', 'T047')": "disease", + "('T031', 'T058', 'T059')": "procedure", + "('T031', 'T059')": "procedure", + "('T031', 'T060')": "procedure", + "('T031', 'T062', 'T078', 'T169', 'T170')": "named thing", + "('T031', 'T073', 'T078', 'T080', 'T081', 'T093', 'T170')": "named thing", + "('T031', 'T073', 'T093')": "agent", + "('T031', 'T078')": "anatomical entity", + "('T031', 'T079')": "anatomical entity", + "('T031', 'T082')": "anatomical entity", + "('T031', 'T099')": "anatomical entity", + "('T031', 'T101', 'T169')": "anatomical entity", "('T031', 'T121')": "drug", "('T031', 'T121', 'T200')": "drug", + "('T031', 'T168')": "food", + "('T031', 'T169')": "anatomical entity", + "('T031', 'T170')": "anatomical entity", + "('T031', 'T184')": "phenotypic feature", "('T031',)": "anatomical entity", + "('T032', 'T033')": "disease or phenotypic feature", "('T032',)": "named thing", "('T033', 'T034')": "phenomenon", "('T033', 'T034', 
'T047')": "disease", "('T033', 'T034', 'T059')": "phenomenon", + "('T033', 'T034', 'T073', 'T079', 'T093')": "named thing", "('T033', 'T037')": "pathological process", "('T033', 'T037', 'T047')": "disease", "('T033', 'T037', 'T055')": "pathological process", "('T033', 'T037', 'T070', 'T167', 'T191')": "disease", "('T033', 'T039')": "physiological process", + "('T033', 'T039', 'T040', 'T169')": "disease or phenotypic feature", "('T033', 'T040')": "physiological process", "('T033', 'T040', 'T046', 'T047')": "disease", "('T033', 'T040', 'T047')": "disease", @@ -185,6 +334,7 @@ "('T033', 'T055')": "behavior", "('T033', 'T055', 'T061')": "procedure", "('T033', 'T055', 'T185')": "behavior", + "('T033', 'T056', 'T073', 'T078', 'T079', 'T081', 'T093', 'T169', 'T170')": "named thing", "('T033', 'T056', 'T078', 'T080', 'T169', 'T170')": "publication", "('T033', 'T057', 'T080')": "activity", "('T033', 'T058')": "activity", @@ -192,12 +342,19 @@ "('T033', 'T060')": "procedure", "('T033', 'T060', 'T080')": "procedure", "('T033', 'T061')": "procedure", + "('T033', 'T061', 'T078', 'T079', 'T081', 'T170')": "named thing", "('T033', 'T061', 'T168')": "procedure", "('T033', 'T067')": "phenomenon", "('T033', 'T069', 'T131')": "phenomenon", + "('T033', 'T073', 'T078', 'T079', 'T093', 'T169', 'T170')": "disease or phenotypic feature", + "('T033', 'T073', 'T078', 'T080', 'T093', 'T169')": "disease or phenotypic feature", + "('T033', 'T073', 'T079', 'T080', 'T169', 'T170')": "named thing", + "('T033', 'T073', 'T093')": "disease or phenotypic feature", "('T033', 'T074')": "device", "('T033', 'T078')": "disease or phenotypic feature", "('T033', 'T078', 'T079', 'T170')": "publication", + "('T033', 'T078', 'T080', 'T081', 'T169')": "disease or phenotypic feature", + "('T033', 'T078', 'T080', 'T170')": "named thing", "('T033', 'T078', 'T089', 'T095', 'T170')": "publication", "('T033', 'T078', 'T089', 'T170')": "publication", "('T033', 'T078', 'T169', 'T170')": "publication", @@ -208,19 
+365,29 @@ "('T033', 'T080', 'T082')": "disease or phenotypic feature", "('T033', 'T080', 'T170')": "publication", "('T033', 'T081')": "disease or phenotypic feature", + "('T033', 'T082')": "disease or phenotypic feature", + "('T033', 'T082', 'T170')": "named thing", "('T033', 'T083', 'T093', 'T169', 'T170')": "publication", "('T033', 'T089')": "disease or phenotypic feature", + "('T033', 'T091', 'T169')": "disease or phenotypic feature", + "('T033', 'T092', 'T170')": "publication", + "('T033', 'T093')": "disease or phenotypic feature", + "('T033', 'T097')": "disease or phenotypic feature", "('T033', 'T098')": "population of individual organisms", "('T033', 'T098', 'T116', 'T121', 'T129')": "drug", "('T033', 'T098', 'T121', 'T129')": "drug", "('T033', 'T099')": "cohort", + "('T033', 'T099', 'T200')": "named thing", "('T033', 'T101')": "cohort", "('T033', 'T102')": "disease or phenotypic feature", + "('T033', 'T109', 'T121')": "named thing", "('T033', 'T109', 'T122')": "chemical entity", "('T033', 'T109', 'T123')": "chemical entity", "('T033', 'T116', 'T123')": "polypeptide", + "('T033', 'T116', 'T126')": "protein", "('T033', 'T116', 'T129')": "polypeptide", "('T033', 'T121')": "drug", + "('T033', 'T121', 'T125', 'T127')": "drug", "('T033', 'T122')": "device", "('T033', 'T168')": "food", "('T033', 'T168', 'T170')": "food", @@ -251,67 +418,333 @@ "('T034', 'T196')": "small molecule", "('T034', 'T201')": "phenomenon", "('T034',)": "phenomenon", + "('T037', 'T046')": "pathological process", "('T037', 'T047')": "disease", + "('T037', 'T058')": "pathological process", + "('T037', 'T059')": "procedure", + "('T037', 'T061')": "procedure", + "('T037', 'T067')": "pathological process", + "('T037', 'T073', 'T092')": "agent", + "('T037', 'T109', 'T195')": "drug", + "('T037', 'T116', 'T123', 'T131')": "polypeptide", + "('T037', 'T121', 'T123', 'T196')": "drug", + "('T037', 'T123', 'T197')": "chemical entity", + "('T037', 'T190')": "disease", + "('T037', 'T204')": "pathological 
process", "('T037',)": "pathological process", + "('T038', 'T039')": "phenomenon", + "('T038', 'T039', 'T043')": "phenomenon", + "('T038', 'T040')": "phenomenon", + "('T038', 'T040', 'T043')": "phenomenon", + "('T038', 'T040', 'T080', 'T169')": "phenomenon", + "('T038', 'T042')": "phenomenon", "('T038', 'T043')": "phenomenon", + "('T038', 'T044')": "phenomenon", + "('T038', 'T046')": "pathological process", + "('T038', 'T070')": "phenomenon", + "('T038', 'T169')": "phenomenon", "('T038',)": "phenomenon", "('T039', 'T040')": "physiological process", + "('T039', 'T042')": "physiological process", + "('T039', 'T043')": "physiological process", + "('T039', 'T043', 'T044')": "physiological process", + "('T039', 'T044')": "physiological process", + "('T039', 'T047')": "disease", + "('T039', 'T061')": "physiological process", + "('T039', 'T070')": "phenomenon", + "('T039', 'T109', 'T121')": "drug", "('T039', 'T121')": "drug", + "('T039', 'T121', 'T125')": "drug", "('T039',)": "physiological process", "('T040', 'T042')": "physiological process", "('T040', 'T043')": "physiological process", + "('T040', 'T043', 'T044')": "physiological process", "('T040', 'T044')": "physiological process", + "('T040', 'T045')": "physiological process", + "('T040', 'T046')": "pathological process", + "('T040', 'T047')": "disease", + "('T040', 'T055')": "behavior", + "('T040', 'T061')": "procedure", + "('T040', 'T070')": "phenomenon", "('T040',)": "physiological process", + "('T041', 'T042')": "physiological process", + "('T041', 'T046')": "pathological process", + "('T041', 'T047')": "disease", + "('T041', 'T048')": "disease", + "('T041', 'T048', 'T055')": "disease", + "('T041', 'T048', 'T184')": "disease", + "('T041', 'T053', 'T058')": "behavior", + "('T041', 'T054')": "behavior", + "('T041', 'T054', 'T055')": "behavior", + "('T041', 'T055')": "behavior", + "('T041', 'T055', 'T078')": "behavior", + "('T041', 'T058')": "behavior", + "('T041', 'T061')": "procedure", + "('T041', 'T062')": 
"behavior", + "('T041', 'T062', 'T091')": "behavior", + "('T041', 'T067')": "phenomenon", + "('T041', 'T067', 'T080', 'T091', 'T097')": "phenomenon", + "('T041', 'T078')": "behavior", + "('T041', 'T078', 'T080')": "behavior", + "('T041', 'T078', 'T102')": "behavior", + "('T041', 'T080')": "behavior", + "('T041', 'T081')": "behavior", + "('T041', 'T091')": "behavior", + "('T041', 'T170')": "publication", + "('T041', 'T184')": "phenotypic feature", "('T041',)": "behavior", + "('T042', 'T043')": "physiological process", + "('T042', 'T044')": "physiological process", + "('T042', 'T060')": "physiological process", + "('T042', 'T080')": "physiological process", + "('T042', 'T116', 'T126')": "protein", + "('T042', 'T121')": "drug", + "('T042', 'T201')": "physiological process", "('T042',)": "physiological process", "('T043', 'T044')": "physiological process", "('T043', 'T045')": "physiological process", + "('T043', 'T046')": "pathological process", + "('T043', 'T079')": "physiological process", "('T043',)": "physiological process", "('T044', 'T045')": "physiological process", + "('T044', 'T046')": "pathological process", + "('T044', 'T047')": "disease", + "('T044', 'T070')": "phenomenon", "('T044',)": "molecular activity", + "('T045', 'T049')": "disease", "('T045',)": "physiological process", "('T046', 'T047')": "disease", + "('T046', 'T056')": "pathological process", + "('T046', 'T061')": "pathological process", + "('T046', 'T082', 'T201')": "pathological process", + "('T046', 'T109', 'T121')": "drug", + "('T046', 'T109', 'T121', 'T130')": "drug", + "('T046', 'T116', 'T121')": "drug", + "('T046', 'T184')": "pathological process", + "('T046', 'T190')": "disease", + "('T046', 'T191')": "disease", "('T046',)": "pathological process", + "('T047', 'T048')": "disease", + "('T047', 'T048', 'T184')": "disease or phenotypic feature", + "('T047', 'T049')": "disease", + "('T047', 'T050')": "disease", + "('T047', 'T059')": "named thing", + "('T047', 'T060')": "disease", + "('T047', 
'T061')": "disease", + "('T047', 'T067')": "disease", + "('T047', 'T080')": "disease", + "('T047', 'T081')": "disease", + "('T047', 'T109', 'T121')": "drug", + "('T047', 'T109', 'T121', 'T123')": "named thing", + "('T047', 'T109', 'T123')": "disease", + "('T047', 'T116', 'T121', 'T123')": "named thing", + "('T047', 'T116', 'T123')": "named thing", + "('T047', 'T116', 'T129')": "named thing", + "('T047', 'T169')": "disease", "('T047', 'T184')": "disease or phenotypic feature", "('T047', 'T190')": "disease", "('T047', 'T191')": "disease", + "('T047', 'T196')": "disease", + "('T047', 'T204')": "disease", "('T047',)": "disease", + "('T048', 'T054')": "disease", + "('T048', 'T055')": "disease", + "('T048', 'T184')": "disease or phenotypic feature", "('T048',)": "disease", + "('T049', 'T059')": "disease", "('T049',)": "disease", "('T050', 'T191')": "disease", - "('T050',)": "biological entity", + "('T050',)": "named thing", "('T051',)": "event", + "('T052', 'T079')": "activity", "('T052',)": "activity", "('T053',)": "behavior", + "('T054', 'T055')": "behavior", + "('T054', 'T068')": "behavior", + "('T054', 'T078')": "behavior", + "('T054', 'T080')": "behavior", + "('T054', 'T098')": "behavior", "('T054',)": "behavior", + "('T055', 'T078')": "behavior", + "('T055', 'T080')": "behavior", + "('T055', 'T170')": "behavior", "('T055',)": "behavior", + "('T056', 'T073')": "activity", + "('T056', 'T079')": "activity", "('T056',)": "activity", + "('T057', 'T058')": "activity", + "('T057', 'T062')": "activity", + "('T057', 'T073')": "activity", + "('T057', 'T073', 'T170')": "activity", + "('T057', 'T078')": "activity", + "('T057', 'T079')": "activity", + "('T057', 'T080')": "activity", + "('T057', 'T081')": "activity", + "('T057', 'T090')": "activity", + "('T057', 'T170')": "activity", "('T057',)": "activity", + "('T058', 'T060')": "procedure", + "('T058', 'T061')": "procedure", + "('T058', 'T065')": "activity", + "('T058', 'T073', 'T093')": "agent", + "('T058', 'T078')": 
"activity", + "('T058', 'T080')": "activity", + "('T058', 'T081')": "activity", + "('T058', 'T091')": "activity", + "('T058', 'T093')": "activity", + "('T058', 'T097')": "activity", + "('T058', 'T098', 'T116', 'T121', 'T129')": "drug", + "('T058', 'T098', 'T121', 'T129')": "drug", + "('T058', 'T101')": "activity", + "('T058', 'T169')": "activity", + "('T058', 'T170')": "publication", + "('T058', 'T184')": "phenotypic feature", "('T058',)": "activity", + "('T059', 'T060')": "procedure", + "('T059', 'T060', 'T170')": "procedure", + "('T059', 'T061')": "procedure", + "('T059', 'T063')": "procedure", + "('T059', 'T070')": "phenomenon", + "('T059', 'T073')": "procedure", + "('T059', 'T073', 'T074')": "device", + "('T059', 'T074')": "device", + "('T059', 'T075')": "procedure", + "('T059', 'T078')": "procedure", + "('T059', 'T080')": "procedure", + "('T059', 'T080', 'T169')": "procedure", + "('T059', 'T081')": "procedure", + "('T059', 'T082')": "procedure", + "('T059', 'T090')": "procedure", + "('T059', 'T091')": "procedure", + "('T059', 'T093')": "agent", + "('T059', 'T109')": "procedure", + "('T059', 'T109', 'T121')": "drug", + "('T059', 'T109', 'T121', 'T127')": "drug", + "('T059', 'T109', 'T121', 'T130')": "drug", + "('T059', 'T109', 'T122')": "chemical entity", + "('T059', 'T109', 'T123')": "procedure", + "('T059', 'T109', 'T127')": "small molecule", + "('T059', 'T109', 'T130')": "chemical entity", + "('T059', 'T109', 'T195')": "drug", + "('T059', 'T116')": "polypeptide", + "('T059', 'T116', 'T121', 'T125')": "drug", + "('T059', 'T116', 'T121', 'T129')": "drug", + "('T059', 'T116', 'T121', 'T129', 'T130')": "drug", + "('T059', 'T116', 'T123')": "polypeptide", + "('T059', 'T116', 'T129')": "polypeptide", + "('T059', 'T116', 'T195')": "drug", + "('T059', 'T121')": "drug", + "('T059', 'T130')": "chemical entity", + "('T059', 'T168')": "food", + "('T059', 'T169')": "procedure", + "('T059', 'T170')": "procedure", + "('T059', 'T184')": "phenotypic feature", + "('T059', 
'T200')": "drug", "('T059',)": "procedure", + "('T060', 'T061')": "procedure", + "('T060', 'T074')": "procedure", + "('T060', 'T081')": "procedure", + "('T060', 'T091')": "procedure", + "('T060', 'T121')": "drug", + "('T060', 'T170')": "procedure", + "('T060', 'T184')": "phenotypic feature", + "('T060', 'T204')": "procedure", "('T060',)": "procedure", + "('T061', 'T062')": "procedure", + "('T061', 'T068')": "phenomenon", + "('T061', 'T073', 'T093')": "physical entity", + "('T061', 'T074')": "device", + "('T061', 'T078', 'T080')": "procedure", + "('T061', 'T079')": "procedure", + "('T061', 'T091')": "procedure", + "('T061', 'T098')": "procedure", + "('T061', 'T109', 'T121')": "drug", + "('T061', 'T116', 'T121', 'T129')": "drug", + "('T061', 'T121')": "drug", + "('T061', 'T169')": "procedure", "('T061',)": "procedure", + "('T062', 'T081')": "activity", + "('T062', 'T083')": "activity", + "('T062', 'T091')": "activity", + "('T062', 'T170')": "activity", "('T062',)": "activity", "('T063',)": "procedure", + "('T064', 'T078')": "activity", + "('T064', 'T081')": "activity", + "('T064', 'T089')": "activity", "('T064',)": "activity", + "('T065', 'T080', 'T185')": "activity", + "('T065', 'T109')": "chemical entity", "('T065',)": "activity", + "('T066', 'T073')": "activity", + "('T066', 'T170')": "activity", "('T066',)": "activity", + "('T067', 'T070')": "phenomenon", + "('T067', 'T116', 'T121', 'T123')": "drug", "('T067',)": "phenomenon", + "('T068', 'T073')": "phenomenon", "('T068',)": "phenomenon", "('T069',)": "phenomenon", + "('T070', 'T078')": "phenomenon", + "('T070', 'T083')": "phenomenon", + "('T070', 'T169', 'T170')": "named thing", "('T070',)": "phenomenon", "('T071',)": "named thing", "('T072',)": "physical entity", + "('T073', 'T074')": "device", + "('T073', 'T078', 'T079', 'T080', 'T169', 'T170')": "publication", + "('T073', 'T078', 'T093')": "agent", + "('T073', 'T079', 'T093', 'T170')": "publication", + "('T073', 'T080', 'T169')": "physical entity", + 
"('T073', 'T083', 'T093')": "agent", + "('T073', 'T090')": "physical entity", "('T073', 'T092')": "agent", "('T073', 'T093')": "agent", + "('T073', 'T093', 'T169')": "agent", + "('T073', 'T093', 'T170')": "agent", + "('T073', 'T109', 'T121')": "drug", + "('T073', 'T121')": "drug", + "('T073', 'T167')": "chemical entity", + "('T073', 'T167', 'T170')": "chemical entity", "('T073', 'T170')": "publication", + "('T073', 'T200')": "drug", "('T073',)": "physical entity", + "('T074', 'T109')": "device", + "('T074', 'T109', 'T120')": "device", "('T074', 'T109', 'T121')": "drug", + "('T074', 'T109', 'T121', 'T127')": "drug", + "('T074', 'T109', 'T122')": "device", + "('T074', 'T109', 'T130')": "device", + "('T074', 'T114', 'T121')": "drug", + "('T074', 'T116', 'T121')": "drug", + "('T074', 'T121')": "drug", + "('T074', 'T121', 'T123', 'T196')": "drug", + "('T074', 'T121', 'T127')": "drug", + "('T074', 'T121', 'T129')": "drug", + "('T074', 'T121', 'T197')": "drug", + "('T074', 'T122')": "device", + "('T074', 'T168')": "food", "('T074', 'T200')": "drug", + "('T074', 'T203')": "device", "('T074',)": "device", "('T075',)": "device", - "('T077',)": "information content entity", - "('T078',)": "information content entity", + "('T077', 'T078')": "named thing", + "('T077', 'T170')": "publication", + "('T077',)": "named thing", + "('T078', 'T079')": "named thing", + "('T078', 'T079', 'T170')": "publication", + "('T078', 'T080')": "named thing", + "('T078', 'T080', 'T082', 'T099')": "cohort", + "('T078', 'T080', 'T170')": "publication", + "('T078', 'T081')": "named thing", + "('T078', 'T089')": "named thing", + "('T078', 'T091')": "named thing", + "('T078', 'T092')": "agent", + "('T078', 'T098')": "population of individual organisms", + "('T078', 'T169')": "named thing", + "('T078', 'T169', 'T170')": "publication", + "('T078', 'T170')": "publication", + "('T078',)": "named thing", "('T079', 'T080')": "named thing", "('T079', 'T080', 'T083')": "geographic location", "('T079', 'T080', 
'T170')": "publication", @@ -331,28 +764,73 @@ "('T079', 'T169')": "named thing", "('T079', 'T170')": "publication", "('T079',)": "named thing", - "('T080',)": "information content entity", - "('T081',)": "information content entity", - "('T082',)": "information content entity", + "('T080', 'T081')": "named thing", + "('T080', 'T081', 'T169')": "named thing", + "('T080', 'T082', 'T169')": "named thing", + "('T080', 'T089')": "named thing", + "('T080', 'T169')": "named thing", + "('T080', 'T170')": "publication", + "('T080',)": "named thing", + "('T081', 'T083')": "geographic location", + "('T081', 'T085')": "named thing", + "('T081', 'T086')": "nucleic acid entity", + "('T081', 'T087')": "polypeptide", + "('T081', 'T097')": "cohort", + "('T081', 'T098')": "population of individual organisms", + "('T081', 'T102')": "named thing", + "('T081', 'T109')": "chemical entity", + "('T081', 'T121')": "drug", + "('T081', 'T167')": "chemical entity", + "('T081', 'T167', 'T168', 'T169')": "food", + "('T081', 'T169')": "named thing", + "('T081', 'T170')": "publication", + "('T081', 'T196')": "small molecule", + "('T081', 'T201')": "named thing", + "('T081',)": "named thing", + "('T082', 'T083')": "geographic location", + "('T082', 'T098')": "population of individual organisms", + "('T082', 'T103')": "chemical entity", + "('T082', 'T109', 'T123')": "chemical entity", + "('T082', 'T116', 'T123')": "polypeptide", + "('T082', 'T170')": "publication", + "('T082', 'T190')": "disease", + "('T082', 'T191')": "disease", + "('T082',)": "named thing", + "('T083', 'T169')": "geographic location", "('T083',)": "geographic location", - "('T085',)": "biological entity", + "('T085',)": "named thing", "('T086',)": "nucleic acid entity", "('T087',)": "polypeptide", - "('T088',)": "biological entity", - "('T089',)": "information content entity", + "('T088',)": "named thing", + "('T089', 'T170')": "publication", + "('T089',)": "named thing", + "('T090', 'T091')": "individual organism", + "('T090', 
'T170')": "individual organism", "('T090',)": "individual organism", + "('T091', 'T097')": "cohort", "('T091',)": "named thing", + "('T092', 'T097', 'T170')": "agent", + "('T092', 'T170')": "named thing", "('T092',)": "agent", + "('T093', 'T109', 'T123')": "agent", + "('T093', 'T116', 'T123')": "polypeptide", + "('T093', 'T121')": "drug", "('T093',)": "agent", "('T094',)": "agent", "('T095',)": "agent", "('T096',)": "agent", + "('T097', 'T170')": "cohort", "('T097',)": "cohort", + "('T098', 'T109', 'T121', 'T129')": "drug", + "('T098', 'T116', 'T121', 'T129')": "drug", + "('T098', 'T121', 'T129')": "named thing", + "('T098', 'T170')": "publication", "('T098',)": "population of individual organisms", + "('T099', 'T102')": "cohort", "('T099',)": "cohort", "('T100',)": "cohort", "('T101',)": "cohort", - "('T102',)": "information content entity", + "('T102',)": "named thing", "('T103',)": "chemical entity", "('T104', 'T109')": "chemical entity", "('T104', 'T109', 'T116', 'T121', 'T123', 'T130')": "drug", @@ -679,6 +1157,7 @@ "('T116', 'T195', 'T200')": "drug", "('T116', 'T200')": "drug", "('T116',)": "polypeptide", + "('T120', 'T121')": "drug", "('T120',)": "chemical entity", "('T121', 'T122')": "drug", "('T121', 'T122', 'T127')": "drug", @@ -761,7 +1240,9 @@ "('T123', 'T197', 'T200')": "drug", "('T123', 'T200')": "drug", "('T123',)": "chemical entity", + "('T125', 'T130')": "chemical entity", "('T125',)": "chemical entity", + "('T126', 'T129')": "protein", "('T126',)": "protein", "('T127',)": "small molecule", "('T129', 'T130')": "chemical entity", @@ -771,7 +1252,7 @@ "('T129', 'T185')": "named thing", "('T129', 'T192')": "protein", "('T129', 'T200')": "drug", - "('T129',)": "biological entity", + "('T129',)": "named thing", "('T130', 'T131')": "chemical entity", "('T130', 'T131', 'T196')": "small molecule", "('T130', 'T131', 'T196', 'T197')": "small molecule", @@ -784,23 +1265,33 @@ "('T130', 'T197', 'T200')": "drug", "('T130', 'T200')": "drug", "('T130',)": 
"chemical entity", + "('T131', 'T167')": "chemical entity", "('T131', 'T196')": "small molecule", + "('T131', 'T196', 'T197')": "small molecule", "('T131', 'T197')": "chemical entity", + "('T131', 'T197', 'T200')": "drug", "('T131',)": "chemical entity", "('T167',)": "chemical entity", + "('T168', 'T200')": "drug", "('T168',)": "food", - "('T169',)": "information content entity", + "('T169', 'T170')": "publication", + "('T169',)": "named thing", + "('T170', 'T185')": "publication", "('T170',)": "publication", - "('T171',)": "information content entity", + "('T171',)": "named thing", + "('T184', 'T190')": "disease or phenotypic feature", "('T184',)": "phenotypic feature", - "('T185',)": "information content entity", + "('T185',)": "named thing", "('T190',)": "disease", "('T191',)": "disease", "('T192',)": "protein", "('T194',)": "organism taxon", "('T195',)": "drug", + "('T196', 'T197')": "small molecule", "('T196',)": "small molecule", + "('T197', 'T200')": "drug", "('T197',)": "chemical entity", + "('T200', 'T203')": "drug", "('T200',)": "drug", "('T201',)": "named thing", "('T203',)": "device", diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 711021cd..2063e960 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -132,16 +132,20 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed synonyms = list() names = info.get(NAMES_KEY, dict()) pt = names.get('PT', dict()) + synonyms += [syn for syn in names.get('SY', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] if 'Y' in pt: name = pt.get('Y', '') assert len(name) == 1, str(name) + ' ' + node_curie name = name[0] - else: + elif 'N' in pt: name = pt.get('N', '') assert len(name) == 1, str(name) + ' ' + node_curie name = name[0] - synonyms += [syn for syn in names.get('SY', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] + else: 
+ name = synonyms[0] + synonyms = synonyms[1:] + name = name[0] node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms From 770fb5a316f511b872725f8dbab6fdecf8dbf922 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 17 Aug 2023 12:28:03 -0700 Subject: [PATCH 028/117] #316 FMA is working now --- tui_combo_mappings.json | 3 +++ umls_list_jsonl_to_kg_jsonl.py | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/tui_combo_mappings.json b/tui_combo_mappings.json index ea707d0a..6aebaa1b 100644 --- a/tui_combo_mappings.json +++ b/tui_combo_mappings.json @@ -256,11 +256,14 @@ "('T028', 'T116', 'T123')": "polypeptide", "('T028', 'T191')": "disease", "('T028',)": "named thing", + "('T029', 'T030')": "anatomical entity", "('T029', 'T061')": "procedure", "('T029', 'T078', 'T170')": "publication", "('T029', 'T081', 'T167', 'T170')": "named thing", "('T029', 'T082')": "anatomical entity", "('T029', 'T116', 'T129')": "polypeptide", + "('T029', 'T170')": "publication", + "('T029', 'T184')": "phenotypic feature", "('T029',)": "anatomical entity", "('T030', 'T033')": "anatomical entity", "('T030',)": "anatomical entity", diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 2063e960..c544fa97 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -192,6 +192,50 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu nodes_output.write(node) +def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + curie_prefix = "FMA" # This should be replaced with a kg2_util prefix at some point + provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) + iri = iri_mappings[curie_prefix] + node_id + node_curie = make_node_id(curie_prefix, node_id) + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + + # Currently not used, but extracting them in 
case we want them in the future + authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) + date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) + + name = str() + synonyms = list() + names = info.get(NAMES_KEY, dict()) + pt = names.get('PT', dict()) + synonyms += [syn for syn in names.get('SY', dict()).get('Y', list())] + synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] + if 'Y' in pt: + name = pt.get('Y', '') + if len(name) > 1: + synonyms += name[1:] + name = name[0] + elif 'N' in pt: + name = pt.get('N', '') + if len(name) > 1: + synonyms += name[1:] + name = name[0] + else: + name = synonyms[0] + synonyms = synonyms[1:] + name = name[0] + + node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) + node['synonym'] = synonyms + description = str() + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description.strip("; ") + node['description'] = description + + nodes_output.write(node) + + if __name__ == '__main__': args = get_args() input_file_name = args.inputFile @@ -238,5 +282,8 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu if source == 'DRUGBANK': process_drugbank_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + if source == 'FMA': + process_fma_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file From bffcb152607218dd98362afaca528acb2016c0dd Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 17 Aug 2023 16:56:29 -0700 Subject: [PATCH 029/117] #316 more tui combo mappings --- tui_combo_mappings.json | 298 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) diff --git a/tui_combo_mappings.json b/tui_combo_mappings.json index 
6aebaa1b..543efc37 100644 --- a/tui_combo_mappings.json +++ b/tui_combo_mappings.json @@ -105,23 +105,47 @@ "('T017', 'T061')": "procedure", "('T017', 'T091')": "anatomical entity", "('T017',)": "anatomical entity", + "('T018', 'T019')": "disease", + "('T018', 'T019', 'T024')": "gross anatomical structure", + "('T018', 'T019', 'T028', 'T033', 'T047')": "disease or phenotypic feature", + "('T018', 'T024')": "gross anatomical structure", + "('T018', 'T025')": "cell", + "('T018', 'T026')": "cellular component", "('T018',)": "gross anatomical structure", "('T019', 'T020')": "disease", + "('T019', 'T020', 'T033', 'T190')": "disease", + "('T019', 'T020', 'T037', 'T190')": "disease", + "('T019', 'T020', 'T047')": "disease", + "('T019', 'T020', 'T047', 'T190')": "disease", + "('T019', 'T020', 'T190')": "disease", "('T019', 'T023')": "disease", "('T019', 'T028')": "disease", "('T019', 'T028', 'T033')": "disease", "('T019', 'T028', 'T033', 'T047')": "disease", "('T019', 'T028', 'T033', 'T047', 'T191')": "disease", + "('T019', 'T028', 'T033', 'T190')": "disease", + "('T019', 'T028', 'T033', 'T191')": "disease", "('T019', 'T028', 'T047')": "disease", + "('T019', 'T028', 'T047', 'T190')": "disease", + "('T019', 'T030')": "disease", + "('T019', 'T031', 'T047')": "disease", "('T019', 'T033')": "disease", + "('T019', 'T033', 'T037', 'T047', 'T190')": "disease", "('T019', 'T033', 'T047')": "disease", + "('T019', 'T033', 'T047', 'T190')": "disease", "('T019', 'T033', 'T190')": "disease", + "('T019', 'T033', 'T191')": "disease", + "('T019', 'T037')": "disease", "('T019', 'T046')": "disease", "('T019', 'T046', 'T047')": "disease", "('T019', 'T046', 'T080', 'T169')": "disease", "('T019', 'T047')": "disease", + "('T019', 'T047', 'T049')": "disease", + "('T019', 'T047', 'T054')": "disease", "('T019', 'T047', 'T190')": "disease", "('T019', 'T047', 'T191')": "disease", + "('T019', 'T049')": "disease", + "('T019', 'T059')": "disease", "('T019', 'T073')": "disease", "('T019', 'T116', 'T121', 
'T123')": "named thing", "('T019', 'T121', 'T123', 'T196')": "named thing", @@ -145,8 +169,15 @@ "('T020',)": "disease", "('T021',)": "gross anatomical structure", "('T022', 'T023')": "gross anatomical structure", + "('T022', 'T023', 'T030')": "gross anatomical structure", + "('T022', 'T023', 'T040')": "gross anatomical structure", + "('T022', 'T030')": "anatomical entity", "('T022', 'T033')": "anatomical entity", + "('T022', 'T033', 'T047')": "disease", + "('T022', 'T109', 'T129')": "chemical entity", "('T022', 'T116', 'T121')": "drug", + "('T022', 'T170')": "publication", + "('T022', 'T185')": "anatomical entity", "('T022',)": "anatomical entity", "('T023', 'T024')": "gross anatomical structure", "('T023', 'T024', 'T025')": "cell", @@ -240,20 +271,56 @@ "('T026', 'T167')": "cellular component", "('T026', 'T191')": "disease", "('T026',)": "cellular component", + "('T028', 'T031', 'T033')": "disease or phenotypic feature", + "('T028', 'T032')": "named thing", + "('T028', 'T032', 'T033', 'T047')": "disease", + "('T028', 'T032', 'T045')": "physiological process", "('T028', 'T033')": "disease or phenotypic feature", + "('T028', 'T033', 'T034')": "phenomenon", + "('T028', 'T033', 'T037')": "pathological process", + "('T028', 'T033', 'T046')": "pathological process", + "('T028', 'T033', 'T046', 'T047')": "disease", "('T028', 'T033', 'T047')": "disease", + "('T028', 'T033', 'T047', 'T048')": "disease", + "('T028', 'T033', 'T047', 'T109', 'T121')": "named thing", + "('T028', 'T033', 'T047', 'T116', 'T129')": "disease", + "('T028', 'T033', 'T047', 'T190')": "disease", "('T028', 'T033', 'T047', 'T191')": "disease", + "('T028', 'T033', 'T048')": "disease", + "('T028', 'T033', 'T116')": "polypeptide", "('T028', 'T033', 'T191')": "disease", + "('T028', 'T033', 'T201')": "disease or phenotypic feature", + "('T028', 'T034')": "phenomenon", + "('T028', 'T037', 'T047')": "disease", + "('T028', 'T038')": "phenomenon", + "('T028', 'T039')": "physiological process", + "('T028', 
'T042')": "physiological process", "('T028', 'T045')": "physiological process", + "('T028', 'T046')": "pathological process", "('T028', 'T046', 'T047')": "disease", + "('T028', 'T046', 'T047', 'T191')": "disease", "('T028', 'T047')": "disease", "('T028', 'T047', 'T048')": "disease", + "('T028', 'T047', 'T048', 'T191')": "disease", + "('T028', 'T047', 'T116', 'T126')": "protein", "('T028', 'T047', 'T191')": "disease", "('T028', 'T048')": "disease", "('T028', 'T054')": "behavior", + "('T028', 'T062', 'T082', 'T114')": "nucleic acid entity", + "('T028', 'T082')": "named thing", + "('T028', 'T086')": "nucleic acid entity", + "('T028', 'T086', 'T114', 'T123')": "nucleic acid entity", + "('T028', 'T109', 'T121')": "drug", + "('T028', 'T109', 'T130')": "chemical entity", "('T028', 'T114')": "nucleic acid entity", + "('T028', 'T114', 'T116')": "polypeptide", "('T028', 'T114', 'T123')": "nucleic acid entity", + "('T028', 'T116')": "polypeptide", + "('T028', 'T116', 'T121')": "drug", "('T028', 'T116', 'T123')": "polypeptide", + "('T028', 'T116', 'T126')": "protein", + "('T028', 'T170')": "publication", + "('T028', 'T190')": "disease", "('T028', 'T191')": "disease", "('T028',)": "named thing", "('T029', 'T030')": "anatomical entity", @@ -266,6 +333,7 @@ "('T029', 'T184')": "phenotypic feature", "('T029',)": "anatomical entity", "('T030', 'T033')": "anatomical entity", + "('T030', 'T060')": "anatomical entity", "('T030',)": "anatomical entity", "('T031', 'T033')": "anatomical entity", "('T031', 'T033', 'T046')": "pathological process", @@ -296,29 +364,71 @@ "('T031', 'T184')": "phenotypic feature", "('T031',)": "anatomical entity", "('T032', 'T033')": "disease or phenotypic feature", + "('T032', 'T033', 'T054')": "disease or phenotypic feature", + "('T032', 'T033', 'T058', 'T079', 'T080', 'T081', 'T169', 'T170')": "disease or phenotypic feature", + "('T032', 'T033', 'T078', 'T080', 'T170')": "disease or phenotypic feature", + "('T032', 'T033', 'T184')": "phenotypic feature", + 
"('T032', 'T033', 'T201')": "disease or phenotypic feature", + "('T032', 'T037')": "pathological process", + "('T032', 'T038')": "phenomenon", + "('T032', 'T038', 'T040')": "physiological process", + "('T032', 'T039', 'T040', 'T046', 'T121')": "named thing", + "('T032', 'T039', 'T040', 'T201')": "physiological process", + "('T032', 'T040')": "physiological process", + "('T032', 'T040', 'T055', 'T062', 'T071')": "physiological process", + "('T032', 'T041')": "behavior", + "('T032', 'T042')": "physiological process", + "('T032', 'T045')": "physiological process", + "('T032', 'T047')": "disease", + "('T032', 'T053', 'T055')": "behavior", + "('T032', 'T079')": "named thing", + "('T032', 'T081')": "named thing", + "('T032', 'T093')": "agent", "('T032',)": "named thing", "('T033', 'T034')": "phenomenon", + "('T033', 'T034', 'T040')": "disease or phenotypic feature", "('T033', 'T034', 'T047')": "disease", "('T033', 'T034', 'T059')": "phenomenon", + "('T033', 'T034', 'T059', 'T081')": "disease or phenotypic feature", + "('T033', 'T034', 'T073', 'T078', 'T080', 'T170')": "disease or phenotypic feature", "('T033', 'T034', 'T073', 'T079', 'T093')": "named thing", + "('T033', 'T034', 'T201')": "disease or phenotypic feature", "('T033', 'T037')": "pathological process", "('T033', 'T037', 'T047')": "disease", "('T033', 'T037', 'T055')": "pathological process", "('T033', 'T037', 'T070', 'T167', 'T191')": "disease", + "('T033', 'T038', 'T040')": "disease or phenotypic feature", "('T033', 'T039')": "physiological process", "('T033', 'T039', 'T040', 'T169')": "disease or phenotypic feature", + "('T033', 'T039', 'T121')": "disease or phenotypic feature", "('T033', 'T040')": "physiological process", "('T033', 'T040', 'T046', 'T047')": "disease", "('T033', 'T040', 'T047')": "disease", + "('T033', 'T040', 'T081')": "disease or phenotypic feature", + "('T033', 'T040', 'T184')": "phenotypic feature", + "('T033', 'T040', 'T190')": "disease", "('T033', 'T041')": "behavior", + "('T033', 
'T041', 'T048')": "disease", + "('T033', 'T041', 'T184')": "disease or phenotypic feature", "('T033', 'T042')": "physiological process", + "('T033', 'T042', 'T043')": "physiological process", "('T033', 'T042', 'T047')": "disease", + "('T033', 'T042', 'T060')": "disease or phenotypic feature", + "('T033', 'T043')": "disease or phenotypic feature", + "('T033', 'T045')": "disease or phenotypic feature", + "('T033', 'T045', 'T054')": "disease or phenotypic feature", "('T033', 'T046')": "pathological process", "('T033', 'T046', 'T047')": "disease", "('T033', 'T046', 'T047', 'T184')": "disease or phenotypic feature", + "('T033', 'T046', 'T047', 'T190')": "disease", + "('T033', 'T046', 'T048')": "disease", "('T033', 'T046', 'T061', 'T081', 'T093')": "pathological process", + "('T033', 'T046', 'T067', 'T169')": "pathological process", + "('T033', 'T046', 'T079')": "disease or phenotypic feature", "('T033', 'T046', 'T184')": "disease or phenotypic feature", + "('T033', 'T046', 'T190')": "disease", "('T033', 'T047')": "disease", + "('T033', 'T047', 'T048')": "disease", "('T033', 'T047', 'T048', 'T054', 'T102')": "disease", "('T033', 'T047', 'T048', 'T184')": "disease or phenotypic feature", "('T033', 'T047', 'T059', 'T074')": "disease", @@ -328,62 +438,93 @@ "('T033', 'T047', 'T191')": "disease", "('T033', 'T048')": "disease", "('T033', 'T048', 'T054')": "disease", + "('T033', 'T048', 'T055')": "disease", "('T033', 'T048', 'T169')": "disease", + "('T033', 'T048', 'T184')": "disease or phenotypic feature", "('T033', 'T049')": "disease", "('T033', 'T051')": "event", "('T033', 'T052', 'T061')": "procedure", + "('T033', 'T052', 'T066', 'T067')": "disease or phenotypic feature", "('T033', 'T054')": "behavior", + "('T033', 'T054', 'T055', 'T062', 'T072', 'T079', 'T080', 'T081')": "disease or phenotypic feature", "('T033', 'T054', 'T080')": "behavior", + "('T033', 'T054', 'T098')": "disease or phenotypic feature", "('T033', 'T055')": "behavior", + "('T033', 'T055', 'T056', 
'T061')": "disease or phenotypic feature", + "('T033', 'T055', 'T058', 'T073')": "disease or phenotypic feature", "('T033', 'T055', 'T061')": "procedure", + "('T033', 'T055', 'T081')": "disease or phenotypic feature", "('T033', 'T055', 'T185')": "behavior", "('T033', 'T056', 'T073', 'T078', 'T079', 'T081', 'T093', 'T169', 'T170')": "named thing", "('T033', 'T056', 'T078', 'T080', 'T169', 'T170')": "publication", + "('T033', 'T056', 'T078', 'T081', 'T082', 'T083', 'T096')": "disease or phenotypic feature", "('T033', 'T057', 'T080')": "activity", "('T033', 'T058')": "activity", + "('T033', 'T058', 'T059', 'T060')": "disease or phenotypic feature", + "('T033', 'T058', 'T093')": "disease or phenotypic feature", "('T033', 'T059')": "procedure", + "('T033', 'T059', 'T060')": "procedure", "('T033', 'T060')": "procedure", "('T033', 'T060', 'T080')": "procedure", + "('T033', 'T060', 'T185')": "disease or phenotypic feature", "('T033', 'T061')": "procedure", "('T033', 'T061', 'T078', 'T079', 'T081', 'T170')": "named thing", "('T033', 'T061', 'T168')": "procedure", + "('T033', 'T063')": "disease or phenotypic feature", "('T033', 'T067')": "phenomenon", + "('T033', 'T069')": "disease or phenotypic feature", "('T033', 'T069', 'T131')": "phenomenon", + "('T033', 'T070')": "disease or phenotypic feature", + "('T033', 'T073')": "disease or phenotypic feature", "('T033', 'T073', 'T078', 'T079', 'T093', 'T169', 'T170')": "disease or phenotypic feature", + "('T033', 'T073', 'T078', 'T080')": "disease or phenotypic feature", "('T033', 'T073', 'T078', 'T080', 'T093', 'T169')": "disease or phenotypic feature", "('T033', 'T073', 'T079', 'T080', 'T169', 'T170')": "named thing", "('T033', 'T073', 'T093')": "disease or phenotypic feature", "('T033', 'T074')": "device", + "('T033', 'T077')": "disease or phenotypic feature", + "('T033', 'T077', 'T078', 'T080', 'T081', 'T170', 'T201')": "disease or phenotypic feature", "('T033', 'T078')": "disease or phenotypic feature", + "('T033', 'T078', 
'T079', 'T080', 'T170')": "disease or phenotypic feature", "('T033', 'T078', 'T079', 'T170')": "publication", "('T033', 'T078', 'T080', 'T081', 'T169')": "disease or phenotypic feature", + "('T033', 'T078', 'T080', 'T169', 'T170')": "disease or phenotypic feature", "('T033', 'T078', 'T080', 'T170')": "named thing", "('T033', 'T078', 'T089', 'T095', 'T170')": "publication", "('T033', 'T078', 'T089', 'T170')": "publication", "('T033', 'T078', 'T169', 'T170')": "publication", "('T033', 'T078', 'T170')": "publication", + "('T033', 'T078', 'T190')": "disease", "('T033', 'T079')": "disease or phenotypic feature", "('T033', 'T079', 'T080', 'T081', 'T169', 'T170')": "publication", + "('T033', 'T079', 'T081')": "disease or phenotypic feature", "('T033', 'T080')": "disease or phenotypic feature", "('T033', 'T080', 'T082')": "disease or phenotypic feature", + "('T033', 'T080', 'T098')": "disease or phenotypic feature", + "('T033', 'T080', 'T098', 'T201')": "disease or phenotypic feature", "('T033', 'T080', 'T170')": "publication", "('T033', 'T081')": "disease or phenotypic feature", "('T033', 'T082')": "disease or phenotypic feature", "('T033', 'T082', 'T170')": "named thing", "('T033', 'T083', 'T093', 'T169', 'T170')": "publication", "('T033', 'T089')": "disease or phenotypic feature", + "('T033', 'T089', 'T099')": "disease or phenotypic feature", "('T033', 'T091', 'T169')": "disease or phenotypic feature", "('T033', 'T092', 'T170')": "publication", "('T033', 'T093')": "disease or phenotypic feature", + "('T033', 'T095', 'T098')": "disease or phenotypic feature", "('T033', 'T097')": "disease or phenotypic feature", "('T033', 'T098')": "population of individual organisms", + "('T033', 'T098', 'T101')": "disease or phenotypic feature", "('T033', 'T098', 'T116', 'T121', 'T129')": "drug", "('T033', 'T098', 'T121', 'T129')": "drug", "('T033', 'T099')": "cohort", "('T033', 'T099', 'T200')": "named thing", + "('T033', 'T100')": "disease or phenotypic feature", "('T033', 'T101')": 
"cohort", "('T033', 'T102')": "disease or phenotypic feature", "('T033', 'T109', 'T121')": "named thing", + "('T033', 'T109', 'T121', 'T125')": "drug", "('T033', 'T109', 'T122')": "chemical entity", "('T033', 'T109', 'T123')": "chemical entity", "('T033', 'T116', 'T123')": "polypeptide", @@ -397,6 +538,7 @@ "('T033', 'T169')": "disease or phenotypic feature", "('T033', 'T170')": "publication", "('T033', 'T184')": "phenotypic feature", + "('T033', 'T184', 'T201')": "phenotypic feature", "('T033', 'T185')": "disease or phenotypic feature", "('T033', 'T190')": "disease", "('T033', 'T191')": "disease", @@ -421,54 +563,140 @@ "('T034', 'T196')": "small molecule", "('T034', 'T201')": "phenomenon", "('T034',)": "phenomenon", + "('T037', 'T038', 'T040')": "pathological process", "('T037', 'T046')": "pathological process", + "('T037', 'T046', 'T047')": "disease", "('T037', 'T047')": "disease", + "('T037', 'T047', 'T048')": "disease", + "('T037', 'T047', 'T070')": "disease", + "('T037', 'T048')": "disease", + "('T037', 'T048', 'T051')": "disease", + "('T037', 'T048', 'T051', 'T053', 'T055')": "disease", + "('T037', 'T052', 'T068', 'T131')": "pathological process", + "('T037', 'T056', 'T068', 'T073', 'T078', 'T089')": "pathological process", "('T037', 'T058')": "pathological process", "('T037', 'T059')": "procedure", + "('T037', 'T060')": "pathological process", "('T037', 'T061')": "procedure", "('T037', 'T067')": "pathological process", + "('T037', 'T070')": "pathological process", + "('T037', 'T073')": "pathological process", "('T037', 'T073', 'T092')": "agent", + "('T037', 'T078', 'T091')": "pathological process", + "('T037', 'T081')": "pathological process", "('T037', 'T109', 'T195')": "drug", + "('T037', 'T116', 'T121', 'T131')": "named thing", "('T037', 'T116', 'T123', 'T131')": "polypeptide", "('T037', 'T121', 'T123', 'T196')": "drug", "('T037', 'T123', 'T197')": "chemical entity", + "('T037', 'T131')": "pathological process", + "('T037', 'T184')": "disease or 
phenotypic feature", "('T037', 'T190')": "disease", + "('T037', 'T201')": "pathological process", "('T037', 'T204')": "pathological process", "('T037',)": "pathological process", "('T038', 'T039')": "phenomenon", + "('T038', 'T039', 'T042')": "phenomenon", "('T038', 'T039', 'T043')": "phenomenon", + "('T038', 'T039', 'T044')": "phenomenon", "('T038', 'T040')": "phenomenon", "('T038', 'T040', 'T043')": "phenomenon", + "('T038', 'T040', 'T046')": "pathological process", + "('T038', 'T040', 'T054')": "phenomenon", + "('T038', 'T040', 'T070')": "phenomenon", "('T038', 'T040', 'T080', 'T169')": "phenomenon", "('T038', 'T042')": "phenomenon", + "('T038', 'T042', 'T043')": "phenomenon", "('T038', 'T043')": "phenomenon", + "('T038', 'T043', 'T044')": "phenomenon", + "('T038', 'T043', 'T045')": "phenomenon", + "('T038', 'T043', 'T046')": "pathological process", + "('T038', 'T043', 'T070')": "phenomenon", "('T038', 'T044')": "phenomenon", + "('T038', 'T044', 'T046')": "pathological process", + "('T038', 'T044', 'T061')": "phenomenon", + "('T038', 'T045')": "phenomenon", "('T038', 'T046')": "pathological process", + "('T038', 'T047')": "disease", + "('T038', 'T059')": "phenomenon", + "('T038', 'T067')": "phenomenon", "('T038', 'T070')": "phenomenon", "('T038', 'T169')": "phenomenon", + "('T038', 'T201')": "phenomenon", "('T038',)": "phenomenon", "('T039', 'T040')": "physiological process", + "('T039', 'T041')": "behavior", "('T039', 'T042')": "physiological process", + "('T039', 'T042', 'T043')": "physiological process", + "('T039', 'T042', 'T044')": "physiological process", + "('T039', 'T042', 'T046', 'T047', 'T060')": "disease", + "('T039', 'T042', 'T070')": "phenomenon", + "('T039', 'T042', 'T201')": "physiological process", "('T039', 'T043')": "physiological process", "('T039', 'T043', 'T044')": "physiological process", "('T039', 'T044')": "physiological process", + "('T039', 'T044', 'T046')": "pathological process", + "('T039', 'T044', 'T070')": "phenomenon", + "('T039', 
'T044', 'T121', 'T123')": "drug", + "('T039', 'T045')": "physiological process", "('T039', 'T047')": "disease", + "('T039', 'T047', 'T184')": "disease or phenotypic feature", + "('T039', 'T059')": "procedure", "('T039', 'T061')": "physiological process", + "('T039', 'T062')": "activity", + "('T039', 'T066')": "activity", + "('T039', 'T067')": "physiological process", "('T039', 'T070')": "phenomenon", + "('T039', 'T081')": "physiological process", + "('T039', 'T102')": "physiological process", "('T039', 'T109', 'T121')": "drug", + "('T039', 'T109', 'T121', 'T125')": "drug", + "('T039', 'T109', 'T123', 'T125')": "chemical entity", + "('T039', 'T109', 'T125')": "chemical entity", + "('T039', 'T116', 'T121', 'T123')": "drug", + "('T039', 'T116', 'T121', 'T125')": "drug", + "('T039', 'T120')": "chemical entity", "('T039', 'T121')": "drug", + "('T039', 'T121', 'T123')": "drug", "('T039', 'T121', 'T125')": "drug", + "('T039', 'T121', 'T131')": "drug", + "('T039', 'T131')": "chemical entity", + "('T039', 'T201')": "physiological process", "('T039',)": "physiological process", + "('T040', 'T041')": "behavior", + "('T040', 'T041', 'T042')": "behavior", + "('T040', 'T041', 'T046', 'T060')": "pathological process", "('T040', 'T042')": "physiological process", + "('T040', 'T042', 'T043')": "physiological process", "('T040', 'T043')": "physiological process", "('T040', 'T043', 'T044')": "physiological process", + "('T040', 'T043', 'T046')": "pathological process", "('T040', 'T044')": "physiological process", + "('T040', 'T044', 'T045')": "physiological process", + "('T040', 'T044', 'T046')": "pathological process", "('T040', 'T045')": "physiological process", "('T040', 'T046')": "pathological process", + "('T040', 'T046', 'T061')": "pathological process", "('T040', 'T047')": "disease", + "('T040', 'T052', 'T055')": "behavior", + "('T040', 'T053')": "behavior", + "('T040', 'T054')": "behavior", "('T040', 'T055')": "behavior", + "('T040', 'T055', 'T058')": "behavior", + "('T040', 
'T055', 'T081')": "behavior", + "('T040', 'T058')": "activity", + "('T040', 'T058', 'T078')": "activity", "('T040', 'T061')": "procedure", + "('T040', 'T062')": "activity", "('T040', 'T070')": "phenomenon", + "('T040', 'T079')": "physiological process", + "('T040', 'T079', 'T081')": "physiological process", + "('T040', 'T081')": "physiological process", + "('T040', 'T099')": "physiological process", + "('T040', 'T102')": "physiological process", + "('T040', 'T121', 'T197')": "drug", + "('T040', 'T184')": "phenotypic feature", + "('T040', 'T201')": "physiological process", "('T040',)": "physiological process", "('T041', 'T042')": "physiological process", "('T041', 'T046')": "pathological process", @@ -528,26 +756,51 @@ "('T046', 'T191')": "disease", "('T046',)": "pathological process", "('T047', 'T048')": "disease", + "('T047', 'T048', 'T055')": "disease", "('T047', 'T048', 'T184')": "disease or phenotypic feature", "('T047', 'T049')": "disease", "('T047', 'T050')": "disease", + "('T047', 'T054')": "disease", "('T047', 'T059')": "named thing", "('T047', 'T060')": "disease", + "('T047', 'T060', 'T061')": "procedure", "('T047', 'T061')": "disease", + "('T047', 'T061', 'T101')": "disease", "('T047', 'T067')": "disease", + "('T047', 'T068')": "disease", + "('T047', 'T068', 'T078', 'T102', 'T109', 'T131')": "named thing", + "('T047', 'T069', 'T073')": "disease", + "('T047', 'T074')": "disease", + "('T047', 'T078')": "disease", "('T047', 'T080')": "disease", "('T047', 'T081')": "disease", + "('T047', 'T090')": "disease", + "('T047', 'T091')": "disease", + "('T047', 'T091', 'T102')": "disease", + "('T047', 'T098')": "disease", + "('T047', 'T101')": "disease", + "('T047', 'T102')": "disease", "('T047', 'T109', 'T121')": "drug", "('T047', 'T109', 'T121', 'T123')": "named thing", "('T047', 'T109', 'T123')": "disease", + "('T047', 'T116')": "disease", "('T047', 'T116', 'T121', 'T123')": "named thing", + "('T047', 'T116', 'T121', 'T129')": "named thing", "('T047', 'T116', 
'T123')": "named thing", "('T047', 'T116', 'T129')": "named thing", + "('T047', 'T121', 'T129')": "named thing", + "('T047', 'T121', 'T197')": "named thing", + "('T047', 'T122')": "disease", + "('T047', 'T123', 'T168', 'T196')": "small molecule", + "('T047', 'T131', 'T197')": "chemical entity", + "('T047', 'T167')": "disease", "('T047', 'T169')": "disease", "('T047', 'T184')": "disease or phenotypic feature", "('T047', 'T190')": "disease", "('T047', 'T191')": "disease", "('T047', 'T196')": "disease", + "('T047', 'T197')": "disease", + "('T047', 'T201')": "disease", "('T047', 'T204')": "disease", "('T047',)": "disease", "('T048', 'T054')": "disease", @@ -645,12 +898,22 @@ "('T059', 'T200')": "drug", "('T059',)": "procedure", "('T060', 'T061')": "procedure", + "('T060', 'T067', 'T070')": "phenomenon", + "('T060', 'T070')": "procedure", + "('T060', 'T073')": "procedure", "('T060', 'T074')": "procedure", + "('T060', 'T080', 'T170')": "named thing", "('T060', 'T081')": "procedure", + "('T060', 'T081', 'T170')": "named thing", + "('T060', 'T090')": "individual organism", "('T060', 'T091')": "procedure", + "('T060', 'T098')": "population of individual organisms", "('T060', 'T121')": "drug", "('T060', 'T170')": "procedure", + "('T060', 'T170', 'T201')": "named thing", "('T060', 'T184')": "phenotypic feature", + "('T060', 'T185')": "procedure", + "('T060', 'T201')": "procedure", "('T060', 'T204')": "procedure", "('T060',)": "procedure", "('T061', 'T062')": "procedure", @@ -695,22 +958,55 @@ "('T071',)": "named thing", "('T072',)": "physical entity", "('T073', 'T074')": "device", + "('T073', 'T075')": "device", + "('T073', 'T078')": "physical entity", "('T073', 'T078', 'T079', 'T080', 'T169', 'T170')": "publication", "('T073', 'T078', 'T093')": "agent", + "('T073', 'T078', 'T093', 'T169')": "agent", + "('T073', 'T078', 'T093', 'T170')": "publication", + "('T073', 'T078', 'T169', 'T170')": "publication", + "('T073', 'T078', 'T170')": "publication", + "('T073', 'T079', 
'T093')": "agent", "('T073', 'T079', 'T093', 'T170')": "publication", + "('T073', 'T079', 'T170')": "publication", + "('T073', 'T080')": "physical entity", "('T073', 'T080', 'T169')": "physical entity", + "('T073', 'T081', 'T093')": "agent", + "('T073', 'T082')": "physical entity", + "('T073', 'T082', 'T093', 'T170')": "agent", + "('T073', 'T083')": "physical entity", "('T073', 'T083', 'T093')": "agent", "('T073', 'T090')": "physical entity", + "('T073', 'T090', 'T170')": "publication", "('T073', 'T092')": "agent", + "('T073', 'T092', 'T093')": "agent", "('T073', 'T093')": "agent", + "('T073', 'T093', 'T121')": "drug", "('T073', 'T093', 'T169')": "agent", "('T073', 'T093', 'T170')": "agent", + "('T073', 'T098', 'T102')": "population of individual organisms", + "('T073', 'T099', 'T170')": "publication", + "('T073', 'T104')": "chemical entity", + "('T073', 'T109')": "chemical entity", "('T073', 'T109', 'T121')": "drug", + "('T073', 'T109', 'T122')": "chemical entity", + "('T073', 'T109', 'T130')": "chemical entity", + "('T073', 'T109', 'T131')": "chemical entity", + "('T073', 'T109', 'T167')": "chemical entity", + "('T073', 'T109', 'T195')": "drug", + "('T073', 'T120')": "chemical entity", + "('T073', 'T120', 'T121')": "drug", "('T073', 'T121')": "drug", + "('T073', 'T122')": "physical entity", + "('T073', 'T131')": "chemical entity", "('T073', 'T167')": "chemical entity", "('T073', 'T167', 'T170')": "chemical entity", + "('T073', 'T169')": "physical entity", + "('T073', 'T169', 'T170')": "publication", "('T073', 'T170')": "publication", + "('T073', 'T185')": "physical entity", "('T073', 'T200')": "drug", + "('T073', 'T201')": "physical entity", "('T073',)": "physical entity", "('T074', 'T109')": "device", "('T074', 'T109', 'T120')": "device", @@ -718,8 +1014,10 @@ "('T074', 'T109', 'T121', 'T127')": "drug", "('T074', 'T109', 'T122')": "device", "('T074', 'T109', 'T130')": "device", + "('T074', 'T109', 'T195')": "drug", "('T074', 'T114', 'T121')": "drug", "('T074', 
'T116', 'T121')": "drug", + "('T074', 'T116', 'T195')": "drug", "('T074', 'T121')": "drug", "('T074', 'T121', 'T123', 'T196')": "drug", "('T074', 'T121', 'T127')": "drug", From 5b35a3efd5e406bb5cc1aa40548af0c3952102da Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 10:08:15 -0700 Subject: [PATCH 030/117] #316 all of the combo mappings --- tui_combo_mappings.json | 447 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 447 insertions(+) diff --git a/tui_combo_mappings.json b/tui_combo_mappings.json index 543efc37..a8727cd1 100644 --- a/tui_combo_mappings.json +++ b/tui_combo_mappings.json @@ -725,32 +725,95 @@ "('T041', 'T184')": "phenotypic feature", "('T041',)": "behavior", "('T042', 'T043')": "physiological process", + "('T042', 'T043', 'T044')": "physiological process", + "('T042', 'T043', 'T201')": "physiological process", "('T042', 'T044')": "physiological process", + "('T042', 'T045')": "physiological process", + "('T042', 'T046')": "pathological process", "('T042', 'T060')": "physiological process", + "('T042', 'T061')": "procedure", + "('T042', 'T068')": "phenomenon", + "('T042', 'T070')": "phenomenon", + "('T042', 'T079')": "physiological process", "('T042', 'T080')": "physiological process", + "('T042', 'T091')": "physiological process", "('T042', 'T116', 'T126')": "protein", "('T042', 'T121')": "drug", "('T042', 'T201')": "physiological process", "('T042',)": "physiological process", "('T043', 'T044')": "physiological process", "('T043', 'T045')": "physiological process", + "('T043', 'T045', 'T067')": "phenomenon", "('T043', 'T046')": "pathological process", + "('T043', 'T047', 'T059')": "disease", + "('T043', 'T055')": "behavior", + "('T043', 'T060')": "procedure", + "('T043', 'T062')": "activity", + "('T043', 'T067')": "phenomenon", + "('T043', 'T070')": "phenomenon", + "('T043', 'T077')": "physiological process", "('T043', 'T079')": "physiological process", + "('T043', 'T080')": "physiological process", + "('T043', 'T081')": 
"physiological process", + "('T043', 'T082')": "physiological process", + "('T043', 'T116', 'T123')": "polypeptide", + "('T043', 'T121', 'T129')": "drug", + "('T043', 'T191')": "disease", "('T043',)": "physiological process", "('T044', 'T045')": "physiological process", + "('T044', 'T045', 'T080')": "physiological process", + "('T044', 'T045', 'T116', 'T126')": "protein", "('T044', 'T046')": "pathological process", "('T044', 'T047')": "disease", + "('T044', 'T049')": "disease", + "('T044', 'T059')": "procedure", + "('T044', 'T066')": "activity", + "('T044', 'T067')": "phenomenon", "('T044', 'T070')": "phenomenon", + "('T044', 'T070', 'T080')": "phenomenon", + "('T044', 'T078')": "molecular activity", + "('T044', 'T081')": "molecular activity", + "('T044', 'T109', 'T116', 'T121', 'T123')": "drug", + "('T044', 'T116')": "polypeptide", + "('T044', 'T116', 'T126')": "protein", + "('T044', 'T123')": "chemical entity", + "('T044', 'T169')": "molecular activity", "('T044',)": "molecular activity", "('T045', 'T049')": "disease", + "('T045', 'T049', 'T063')": "disease", + "('T045', 'T049', 'T086')": "disease", + "('T045', 'T063')": "procedure", + "('T045', 'T070')": "phenomenon", + "('T045', 'T081')": "physiological process", + "('T045', 'T086')": "nucleic acid entity", + "('T045', 'T114')": "nucleic acid entity", + "('T045', 'T114', 'T123')": "nucleic acid entity", + "('T045', 'T169')": "physiological process", "('T045',)": "physiological process", "('T046', 'T047')": "disease", + "('T046', 'T047', 'T048')": "disease", + "('T046', 'T047', 'T048', 'T184')": "disease", + "('T046', 'T047', 'T074')": "disease", + "('T046', 'T047', 'T184')": "disease", + "('T046', 'T047', 'T190')": "disease", + "('T046', 'T047', 'T191')": "disease", + "('T046', 'T048')": "disease", + "('T046', 'T048', 'T184')": "disease", + "('T046', 'T049')": "disease", + "('T046', 'T055')": "pathological process", "('T046', 'T056')": "pathological process", + "('T046', 'T059')": "pathological process", 
"('T046', 'T061')": "pathological process", + "('T046', 'T067')": "pathological process", + "('T046', 'T070')": "pathological process", + "('T046', 'T080')": "pathological process", "('T046', 'T082', 'T201')": "pathological process", + "('T046', 'T109', 'T116', 'T121', 'T123')": "named thing", "('T046', 'T109', 'T121')": "drug", "('T046', 'T109', 'T121', 'T130')": "drug", "('T046', 'T116', 'T121')": "drug", + "('T046', 'T121')": "named thing", + "('T046', 'T169')": "pathological process", "('T046', 'T184')": "pathological process", "('T046', 'T190')": "disease", "('T046', 'T191')": "disease", @@ -803,58 +866,215 @@ "('T047', 'T201')": "disease", "('T047', 'T204')": "disease", "('T047',)": "disease", + "('T048', 'T051')": "disease", "('T048', 'T054')": "disease", "('T048', 'T055')": "disease", + "('T048', 'T055', 'T056')": "disease", + "('T048', 'T055', 'T167')": "disease", + "('T048', 'T055', 'T184')": "disease", + "('T048', 'T059')": "disease", + "('T048', 'T061')": "disease", + "('T048', 'T068')": "disease", + "('T048', 'T102')": "disease", + "('T048', 'T109', 'T121', 'T131')": "named thing", + "('T048', 'T121', 'T131')": "named thing", "('T048', 'T184')": "disease or phenotypic feature", "('T048',)": "disease", "('T049', 'T059')": "disease", + "('T049', 'T086')": "named thing", "('T049',)": "disease", + "('T050', 'T061')": "procedure", "('T050', 'T191')": "disease", "('T050',)": "named thing", + "('T051', 'T052')": "activity", + "('T051', 'T093')": "named thing", + "('T051', 'T098')": "named thing", "('T051',)": "event", + "('T052', 'T053')": "behavior", + "('T052', 'T054')": "behavior", + "('T052', 'T054', 'T078')": "behavior", + "('T052', 'T055')": "behavior", + "('T052', 'T056')": "activity", + "('T052', 'T057')": "activity", + "('T052', 'T057', 'T064', 'T081', 'T090', 'T170')": "activity", + "('T052', 'T058')": "activity", + "('T052', 'T058', 'T079', 'T170')": "activity", + "('T052', 'T059')": "procedure", + "('T052', 'T059', 'T062', 'T090')": "procedure", + 
"('T052', 'T062', 'T097')": "activity", + "('T052', 'T064')": "activity", + "('T052', 'T066', 'T170')": "activity", + "('T052', 'T068')": "phenomenon", + "('T052', 'T068', 'T072')": "phenomenon", + "('T052', 'T073')": "physical entity", + "('T052', 'T078', 'T081')": "activity", "('T052', 'T079')": "activity", + "('T052', 'T081')": "activity", + "('T052', 'T091')": "activity", + "('T052', 'T095')": "named thing", + "('T052', 'T167')": "chemical entity", + "('T052', 'T170')": "activity", "('T052',)": "activity", + "('T053', 'T054')": "behavior", + "('T053', 'T055')": "behavior", + "('T053', 'T082')": "behavior", "('T053',)": "behavior", "('T054', 'T055')": "behavior", + "('T054', 'T055', 'T068')": "behavior", + "('T054', 'T055', 'T080')": "behavior", + "('T054', 'T055', 'T097')": "behavior", + "('T054', 'T057')": "behavior", + "('T054', 'T058')": "behavior", + "('T054', 'T061')": "behavior", + "('T054', 'T062')": "behavior", + "('T054', 'T062', 'T079', 'T099')": "named thing", + "('T054', 'T064', 'T068', 'T078', 'T080', 'T089', 'T170')": "behavior", + "('T054', 'T064', 'T080')": "behavior", "('T054', 'T068')": "behavior", + "('T054', 'T071')": "behavior", "('T054', 'T078')": "behavior", + "('T054', 'T079')": "behavior", "('T054', 'T080')": "behavior", + "('T054', 'T080', 'T097', 'T170')": "named thing", + "('T054', 'T090')": "named thing", + "('T054', 'T093')": "named thing", + "('T054', 'T095')": "named thing", + "('T054', 'T097')": "named thing", "('T054', 'T098')": "behavior", + "('T054', 'T102')": "behavior", + "('T054', 'T170')": "named thing", "('T054',)": "behavior", + "('T055', 'T056', 'T109', 'T131')": "named thing", + "('T055', 'T057', 'T068', 'T078', 'T090')": "behavior", + "('T055', 'T058', 'T080')": "behavior", + "('T055', 'T061', 'T078')": "behavior", + "('T055', 'T061', 'T098')": "named thing", + "('T055', 'T061', 'T099')": "named thing", + "('T055', 'T064')": "behavior", + "('T055', 'T073', 'T074')": "named thing", "('T055', 'T078')": "behavior", + 
"('T055', 'T079')": "behavior", + "('T055', 'T079', 'T168')": "named thing", "('T055', 'T080')": "behavior", + "('T055', 'T090')": "behavior", + "('T055', 'T102')": "behavior", + "('T055', 'T109', 'T121')": "named thing", + "('T055', 'T131')": "named thing", "('T055', 'T170')": "behavior", "('T055',)": "behavior", + "('T056', 'T058')": "activity", + "('T056', 'T061')": "procedure", + "('T056', 'T068', 'T121')": "named thing", "('T056', 'T073')": "activity", + "('T056', 'T078')": "activity", "('T056', 'T079')": "activity", + "('T056', 'T080')": "activity", + "('T056', 'T089', 'T090')": "named thing", + "('T056', 'T102')": "activity", + "('T056', 'T169')": "activity", + "('T056', 'T170')": "named thing", "('T056',)": "activity", "('T057', 'T058')": "activity", + "('T057', 'T058', 'T080')": "activity", + "('T057', 'T058', 'T170')": "named thing", + "('T057', 'T059')": "named thing", + "('T057', 'T061')": "named thing", + "('T057', 'T061', 'T169')": "named thing", "('T057', 'T062')": "activity", + "('T057', 'T062', 'T081')": "activity", + "('T057', 'T062', 'T081', 'T098', 'T170')": "named thing", + "('T057', 'T062', 'T169', 'T170')": "named thing", + "('T057', 'T064')": "activity", + "('T057', 'T064', 'T073')": "activity", + "('T057', 'T065')": "activity", + "('T057', 'T066')": "activity", + "('T057', 'T067')": "activity", + "('T057', 'T068')": "activity", + "('T057', 'T070')": "phenomenon", "('T057', 'T073')": "activity", + "('T057', 'T073', 'T078', 'T170')": "named thing", + "('T057', 'T073', 'T097')": "named thing", "('T057', 'T073', 'T170')": "activity", + "('T057', 'T074')": "named thing", + "('T057', 'T077', 'T097')": "named thing", "('T057', 'T078')": "activity", + "('T057', 'T078', 'T080')": "activity", + "('T057', 'T078', 'T080', 'T097', 'T170')": "named thing", + "('T057', 'T078', 'T081')": "activity", + "('T057', 'T078', 'T081', 'T097')": "named thing", "('T057', 'T079')": "activity", "('T057', 'T080')": "activity", "('T057', 'T081')": "activity", + 
"('T057', 'T081', 'T170')": "named thing", "('T057', 'T090')": "activity", + "('T057', 'T091')": "activity", + "('T057', 'T092')": "named thing", + "('T057', 'T093')": "named thing", + "('T057', 'T095')": "named thing", + "('T057', 'T097')": "named thing", + "('T057', 'T098')": "named thing", + "('T057', 'T167')": "named thing", + "('T057', 'T168')": "named thing", "('T057', 'T170')": "activity", "('T057',)": "activity", + "('T058', 'T059')": "procedure", "('T058', 'T060')": "procedure", + "('T058', 'T060', 'T061')": "procedure", "('T058', 'T061')": "procedure", + "('T058', 'T061', 'T067')": "procedure", + "('T058', 'T061', 'T073', 'T074', 'T093')": "named thing", + "('T058', 'T061', 'T074')": "procedure", + "('T058', 'T061', 'T091')": "procedure", + "('T058', 'T061', 'T093')": "procedure", + "('T058', 'T062')": "activity", + "('T058', 'T062', 'T080')": "activity", + "('T058', 'T062', 'T170')": "named thing", + "('T058', 'T064')": "activity", + "('T058', 'T064', 'T089')": "activity", "('T058', 'T065')": "activity", + "('T058', 'T065', 'T097')": "named thing", + "('T058', 'T066', 'T170')": "named thing", + "('T058', 'T067', 'T170')": "named thing", + "('T058', 'T068')": "phenomenon", + "('T058', 'T068', 'T074')": "named thing", + "('T058', 'T069')": "phenomenon", + "('T058', 'T073')": "named thing", + "('T058', 'T073', 'T078', 'T093')": "named thing", + "('T058', 'T073', 'T080', 'T093', 'T170')": "named thing", "('T058', 'T073', 'T093')": "agent", + "('T058', 'T074')": "named thing", "('T058', 'T078')": "activity", + "('T058', 'T078', 'T080')": "activity", + "('T058', 'T078', 'T080', 'T090')": "named thing", + "('T058', 'T078', 'T082')": "activity", + "('T058', 'T078', 'T089')": "activity", + "('T058', 'T078', 'T093')": "named thing", + "('T058', 'T079')": "activity", "('T058', 'T080')": "activity", + "('T058', 'T080', 'T081')": "activity", "('T058', 'T081')": "activity", + "('T058', 'T081', 'T170')": "named thing", + "('T058', 'T089')": "activity", + "('T058', 
'T090')": "named thing", + "('T058', 'T090', 'T093')": "named thing", "('T058', 'T091')": "activity", + "('T058', 'T092', 'T093')": "named thing", "('T058', 'T093')": "activity", + "('T058', 'T093', 'T170')": "named thing", + "('T058', 'T095', 'T096')": "named thing", "('T058', 'T097')": "activity", + "('T058', 'T097', 'T170')": "named thing", "('T058', 'T098', 'T116', 'T121', 'T129')": "drug", + "('T058', 'T098', 'T116', 'T129')": "named thing", "('T058', 'T098', 'T121', 'T129')": "drug", + "('T058', 'T099')": "named thing", "('T058', 'T101')": "activity", + "('T058', 'T102')": "activity", + "('T058', 'T121')": "named thing", "('T058', 'T169')": "activity", "('T058', 'T170')": "publication", "('T058', 'T184')": "phenotypic feature", + "('T058', 'T201')": "activity", "('T058',)": "activity", "('T059', 'T060')": "procedure", "('T059', 'T060', 'T170')": "procedure", @@ -917,45 +1137,156 @@ "('T060', 'T204')": "procedure", "('T060',)": "procedure", "('T061', 'T062')": "procedure", + "('T061', 'T063')": "procedure", + "('T061', 'T065')": "procedure", + "('T061', 'T067')": "procedure", "('T061', 'T068')": "phenomenon", + "('T061', 'T070')": "procedure", "('T061', 'T073', 'T093')": "physical entity", "('T061', 'T074')": "device", + "('T061', 'T074', 'T091')": "named thing", + "('T061', 'T074', 'T101')": "named thing", + "('T061', 'T074', 'T122')": "named thing", + "('T061', 'T078')": "procedure", "('T061', 'T078', 'T080')": "procedure", "('T061', 'T079')": "procedure", + "('T061', 'T080')": "procedure", "('T061', 'T091')": "procedure", + "('T061', 'T091', 'T170')": "named thing", + "('T061', 'T093')": "named thing", "('T061', 'T098')": "procedure", "('T061', 'T109', 'T121')": "drug", + "('T061', 'T109', 'T123')": "named thing", "('T061', 'T116', 'T121', 'T129')": "drug", + "('T061', 'T116', 'T126')": "named thing", + "('T061', 'T116', 'T129')": "named thing", "('T061', 'T121')": "drug", + "('T061', 'T121', 'T123')": "named thing", + "('T061', 'T121', 'T129')": "named 
thing", + "('T061', 'T122')": "named thing", + "('T061', 'T168')": "named thing", "('T061', 'T169')": "procedure", + "('T061', 'T170')": "named thing", + "('T061', 'T201')": "procedure", "('T061',)": "procedure", + "('T062', 'T063')": "procedure", + "('T062', 'T063', 'T073')": "named thing", + "('T062', 'T065')": "activity", + "('T062', 'T067')": "phenomenon", + "('T062', 'T070')": "phenomenon", + "('T062', 'T075')": "named thing", + "('T062', 'T075', 'T078')": "named thing", + "('T062', 'T078')": "activity", + "('T062', 'T078', 'T080', 'T081', 'T170')": "named thing", + "('T062', 'T078', 'T081', 'T082', 'T090', 'T097', 'T170')": "named thing", + "('T062', 'T079', 'T098', 'T099', 'T102')": "named thing", + "('T062', 'T080', 'T081')": "activity", + "('T062', 'T080', 'T081', 'T130')": "named thing", "('T062', 'T081')": "activity", + "('T062', 'T081', 'T096', 'T169', 'T170')": "named thing", + "('T062', 'T081', 'T170')": "named thing", "('T062', 'T083')": "activity", + "('T062', 'T087')": "named thing", + "('T062', 'T090')": "named thing", "('T062', 'T091')": "activity", + "('T062', 'T109', 'T121')": "named thing", + "('T062', 'T169')": "activity", "('T062', 'T170')": "activity", + "('T062', 'T185')": "activity", "('T062',)": "activity", + "('T063', 'T075')": "named thing", + "('T063', 'T170')": "named thing", "('T063',)": "procedure", + "('T064', 'T067', 'T078')": "phenomenon", + "('T064', 'T069', 'T078', 'T081')": "phenomenon", "('T064', 'T078')": "activity", + "('T064', 'T078', 'T081', 'T089')": "activity", + "('T064', 'T078', 'T089')": "activity", "('T064', 'T081')": "activity", "('T064', 'T089')": "activity", + "('T064', 'T092')": "named thing", + "('T064', 'T098')": "named thing", + "('T064', 'T098', 'T102')": "named thing", + "('T064', 'T170')": "named thing", "('T064',)": "activity", + "('T065', 'T073', 'T170')": "named thing", + "('T065', 'T078')": "activity", + "('T065', 'T080')": "activity", "('T065', 'T080', 'T185')": "activity", + "('T065', 'T081')": 
"activity", + "('T065', 'T089')": "activity", + "('T065', 'T090')": "named thing", + "('T065', 'T097')": "named thing", + "('T065', 'T098')": "named thing", "('T065', 'T109')": "chemical entity", + "('T065', 'T169')": "activity", + "('T065', 'T170')": "named thing", "('T065',)": "activity", "('T066', 'T073')": "activity", + "('T066', 'T073', 'T074')": "named thing", + "('T066', 'T073', 'T090')": "named thing", + "('T066', 'T073', 'T170')": "named thing", + "('T066', 'T081')": "activity", + "('T066', 'T091')": "activity", "('T066', 'T170')": "activity", "('T066',)": "activity", + "('T067', 'T068')": "phenomenon", + "('T067', 'T068', 'T070')": "phenomenon", + "('T067', 'T068', 'T078')": "phenomenon", + "('T067', 'T068', 'T078', 'T079', 'T081', 'T098')": "named thing", + "('T067', 'T069')": "phenomenon", "('T067', 'T070')": "phenomenon", + "('T067', 'T078')": "phenomenon", + "('T067', 'T079')": "phenomenon", + "('T067', 'T082')": "phenomenon", "('T067', 'T116', 'T121', 'T123')": "drug", "('T067',)": "phenomenon", + "('T068', 'T069')": "phenomenon", + "('T068', 'T070')": "phenomenon", + "('T068', 'T070', 'T073')": "named thing", + "('T068', 'T072', 'T170')": "named thing", "('T068', 'T073')": "phenomenon", + "('T068', 'T073', 'T093')": "named thing", + "('T068', 'T073', 'T098')": "named thing", + "('T068', 'T073', 'T170')": "named thing", + "('T068', 'T075')": "named thing", + "('T068', 'T078')": "phenomenon", + "('T068', 'T078', 'T079')": "phenomenon", + "('T068', 'T080')": "phenomenon", + "('T068', 'T081')": "phenomenon", + "('T068', 'T081', 'T102')": "phenomenon", + "('T068', 'T083')": "named thing", + "('T068', 'T090')": "named thing", + "('T068', 'T090', 'T096')": "named thing", + "('T068', 'T098')": "named thing", + "('T068', 'T167')": "named thing", + "('T068', 'T169')": "phenomenon", + "('T068', 'T170')": "named thing", "('T068',)": "phenomenon", + "('T069', 'T070')": "phenomenon", + "('T069', 'T080')": "phenomenon", "('T069',)": "phenomenon", + "('T070', 
'T073')": "named thing", + "('T070', 'T073', 'T167')": "named thing", "('T070', 'T078')": "phenomenon", + "('T070', 'T080')": "phenomenon", + "('T070', 'T081', 'T082')": "phenomenon", + "('T070', 'T082')": "phenomenon", + "('T070', 'T082', 'T104')": "named thing", "('T070', 'T083')": "phenomenon", + "('T070', 'T091')": "phenomenon", + "('T070', 'T098')": "named thing", + "('T070', 'T104')": "named thing", + "('T070', 'T120')": "named thing", + "('T070', 'T129')": "phenomenon", + "('T070', 'T131')": "named thing", "('T070', 'T169', 'T170')": "named thing", + "('T070', 'T184')": "phenotypic feature", "('T070',)": "phenomenon", + "('T071', 'T073', 'T093', 'T122', 'T169')": "named thing", "('T071',)": "named thing", + "('T072', 'T131')": "chemical entity", + "('T072', 'T170')": "publication", "('T072',)": "physical entity", "('T073', 'T074')": "device", "('T073', 'T075')": "device", @@ -1008,40 +1339,89 @@ "('T073', 'T200')": "drug", "('T073', 'T201')": "physical entity", "('T073',)": "physical entity", + "('T074', 'T078')": "device", + "('T074', 'T081')": "device", + "('T074', 'T081', 'T201')": "device", + "('T074', 'T091')": "device", + "('T074', 'T098')": "named thing", "('T074', 'T109')": "device", "('T074', 'T109', 'T120')": "device", "('T074', 'T109', 'T121')": "drug", + "('T074', 'T109', 'T121', 'T125')": "drug", "('T074', 'T109', 'T121', 'T127')": "drug", + "('T074', 'T109', 'T121', 'T129')": "drug", + "('T074', 'T109', 'T121', 'T131')": "drug", + "('T074', 'T109', 'T121', 'T200')": "drug", "('T074', 'T109', 'T122')": "device", "('T074', 'T109', 'T130')": "device", + "('T074', 'T109', 'T131')": "chemical entity", + "('T074', 'T109', 'T168')": "food", "('T074', 'T109', 'T195')": "drug", + "('T074', 'T109', 'T200')": "drug", "('T074', 'T114', 'T121')": "drug", + "('T074', 'T116')": "polypeptide", "('T074', 'T116', 'T121')": "drug", + "('T074', 'T116', 'T121', 'T122')": "drug", + "('T074', 'T116', 'T121', 'T123')": "drug", + "('T074', 'T116', 'T121', 'T125')": 
"drug", + "('T074', 'T116', 'T121', 'T127')": "drug", + "('T074', 'T116', 'T121', 'T129')": "drug", "('T074', 'T116', 'T195')": "drug", "('T074', 'T121')": "drug", "('T074', 'T121', 'T123', 'T196')": "drug", "('T074', 'T121', 'T127')": "drug", "('T074', 'T121', 'T129')": "drug", + "('T074', 'T121', 'T130', 'T197')": "drug", "('T074', 'T121', 'T197')": "drug", "('T074', 'T122')": "device", + "('T074', 'T130')": "chemical entity", "('T074', 'T168')": "food", + "('T074', 'T170')": "named thing", + "('T074', 'T197')": "chemical entity", "('T074', 'T200')": "drug", "('T074', 'T203')": "device", + "('T074', 'T204')": "named thing", "('T074',)": "device", "('T075',)": "device", "('T077', 'T078')": "named thing", + "('T077', 'T090')": "individual organism", "('T077', 'T170')": "publication", "('T077',)": "named thing", "('T078', 'T079')": "named thing", + "('T078', 'T079', 'T081')": "named thing", + "('T078', 'T079', 'T169')": "named thing", "('T078', 'T079', 'T170')": "publication", "('T078', 'T080')": "named thing", + "('T078', 'T080', 'T081', 'T098', 'T102')": "population of individual organisms", "('T078', 'T080', 'T082', 'T099')": "cohort", + "('T078', 'T080', 'T089')": "named thing", + "('T078', 'T080', 'T090')": "individual organism", + "('T078', 'T080', 'T093', 'T169')": "agent", "('T078', 'T080', 'T170')": "publication", "('T078', 'T081')": "named thing", + "('T078', 'T081', 'T169')": "named thing", + "('T078', 'T081', 'T169', 'T170')": "publication", + "('T078', 'T081', 'T170')": "publication", + "('T078', 'T082')": "named thing", + "('T078', 'T082', 'T170')": "publication", + "('T078', 'T083')": "geographic location", "('T078', 'T089')": "named thing", + "('T078', 'T089', 'T170')": "publication", + "('T078', 'T090')": "individual organism", + "('T078', 'T090', 'T097')": "cohort", "('T078', 'T091')": "named thing", "('T078', 'T092')": "agent", + "('T078', 'T092', 'T098')": "named thing", + "('T078', 'T093')": "agent", + "('T078', 'T093', 'T169')": "agent", + 
"('T078', 'T095')": "agent", + "('T078', 'T096')": "agent", + "('T078', 'T097')": "cohort", "('T078', 'T098')": "population of individual organisms", + "('T078', 'T098', 'T121')": "named thing", + "('T078', 'T098', 'T121', 'T129')": "named thing", + "('T078', 'T102')": "named thing", + "('T078', 'T122')": "device", "('T078', 'T169')": "named thing", "('T078', 'T169', 'T170')": "publication", "('T078', 'T170')": "publication", @@ -1066,11 +1446,19 @@ "('T079', 'T170')": "publication", "('T079',)": "named thing", "('T080', 'T081')": "named thing", + "('T080', 'T081', 'T098')": "population of individual organisms", "('T080', 'T081', 'T169')": "named thing", + "('T080', 'T082')": "named thing", "('T080', 'T082', 'T169')": "named thing", + "('T080', 'T083', 'T093', 'T098')": "named thing", "('T080', 'T089')": "named thing", + "('T080', 'T098')": "population of individual organisms", + "('T080', 'T102')": "named thing", + "('T080', 'T121', 'T201')": "drug", + "('T080', 'T168')": "food", "('T080', 'T169')": "named thing", "('T080', 'T170')": "publication", + "('T080', 'T201')": "named thing", "('T080',)": "named thing", "('T081', 'T083')": "geographic location", "('T081', 'T085')": "named thing", @@ -1097,41 +1485,86 @@ "('T082', 'T190')": "disease", "('T082', 'T191')": "disease", "('T082',)": "named thing", + "('T083', 'T109', 'T130')": "named thing", + "('T083', 'T167')": "named thing", + "('T083', 'T167', 'T169')": "named thing", "('T083', 'T169')": "geographic location", + "('T083', 'T170')": "named thing", + "('T083', 'T204')": "named thing", "('T083',)": "geographic location", + "('T085', 'T123')": "chemical entity", "('T085',)": "named thing", + "('T086', 'T114')": "nucleic acid entity", + "('T086', 'T114', 'T123')": "nucleic acid entity", "('T086',)": "nucleic acid entity", + "('T087', 'T116', 'T123')": "polypeptide", + "('T087', 'T169')": "polypeptide", "('T087',)": "polypeptide", "('T088',)": "named thing", + "('T089', 'T092')": "agent", "('T089', 'T170')": 
"publication", "('T089',)": "named thing", "('T090', 'T091')": "individual organism", + "('T090', 'T097')": "cohort", + "('T090', 'T098')": "population of individual organisms", "('T090', 'T170')": "individual organism", + "('T090', 'T185')": "individual organism", "('T090',)": "individual organism", "('T091', 'T097')": "cohort", + "('T091', 'T109')": "chemical entity", + "('T091', 'T169')": "named thing", + "('T091', 'T170')": "publication", + "('T091', 'T191')": "disease", "('T091',)": "named thing", + "('T092', 'T093')": "agent", + "('T092', 'T094')": "agent", "('T092', 'T097', 'T170')": "agent", + "('T092', 'T099')": "named thing", "('T092', 'T170')": "named thing", "('T092',)": "agent", + "('T093', 'T097', 'T098', 'T170')": "named thing", + "('T093', 'T097', 'T170')": "named thing", + "('T093', 'T109', 'T121', 'T125')": "named thing", "('T093', 'T109', 'T123')": "agent", "('T093', 'T116', 'T123')": "polypeptide", "('T093', 'T121')": "drug", + "('T093', 'T169', 'T170')": "named thing", + "('T093', 'T170')": "named thing", "('T093',)": "agent", "('T094',)": "agent", + "('T095', 'T098')": "named thing", "('T095',)": "agent", + "('T096', 'T101')": "named thing", + "('T096', 'T170')": "named thing", "('T096',)": "agent", + "('T097', 'T098')": "cohort", + "('T097', 'T098', 'T099')": "cohort", + "('T097', 'T102')": "cohort", "('T097', 'T170')": "cohort", "('T097',)": "cohort", + "('T098', 'T099')": "cohort", + "('T098', 'T100')": "cohort", + "('T098', 'T101')": "cohort", + "('T098', 'T102')": "population of individual organisms", "('T098', 'T109', 'T121', 'T129')": "drug", "('T098', 'T116', 'T121', 'T129')": "drug", "('T098', 'T121', 'T129')": "named thing", + "('T098', 'T121', 'T129', 'T170')": "named thing", "('T098', 'T170')": "publication", "('T098',)": "population of individual organisms", + "('T099', 'T100')": "cohort", "('T099', 'T102')": "cohort", + "('T099', 'T170')": "named thing", "('T099',)": "cohort", "('T100',)": "cohort", + "('T101', 'T201')": 
"cohort", "('T101',)": "cohort", + "('T102', 'T122')": "device", "('T102',)": "named thing", + "('T103', 'T109')": "chemical entity", + "('T103', 'T109', 'T116', 'T121')": "drug", + "('T103', 'T120')": "chemical entity", + "('T103', 'T130')": "chemical entity", "('T103',)": "chemical entity", "('T104', 'T109')": "chemical entity", "('T104', 'T109', 'T116', 'T121', 'T123', 'T130')": "drug", @@ -1459,6 +1892,11 @@ "('T116', 'T200')": "drug", "('T116',)": "polypeptide", "('T120', 'T121')": "drug", + "('T120', 'T122')": "chemical entity", + "('T120', 'T130')": "chemical entity", + "('T120', 'T168')": "food", + "('T120', 'T197')": "chemical entity", + "('T120', 'T200')": "drug", "('T120',)": "chemical entity", "('T121', 'T122')": "drug", "('T121', 'T122', 'T127')": "drug", @@ -1544,6 +1982,7 @@ "('T125', 'T130')": "chemical entity", "('T125',)": "chemical entity", "('T126', 'T129')": "protein", + "('T126', 'T130')": "protein", "('T126',)": "protein", "('T127',)": "small molecule", "('T129', 'T130')": "chemical entity", @@ -1567,28 +2006,36 @@ "('T130', 'T200')": "drug", "('T130',)": "chemical entity", "('T131', 'T167')": "chemical entity", + "('T131', 'T167', 'T197')": "chemical entity", "('T131', 'T196')": "small molecule", "('T131', 'T196', 'T197')": "small molecule", "('T131', 'T197')": "chemical entity", "('T131', 'T197', 'T200')": "drug", "('T131',)": "chemical entity", + "('T167', 'T169')": "chemical entity", "('T167',)": "chemical entity", + "('T168', 'T196')": "food", + "('T168', 'T197')": "food", "('T168', 'T200')": "drug", "('T168',)": "food", "('T169', 'T170')": "publication", "('T169',)": "named thing", + "('T170', 'T171')": "publication", "('T170', 'T185')": "publication", "('T170',)": "publication", "('T171',)": "named thing", "('T184', 'T190')": "disease or phenotypic feature", "('T184',)": "phenotypic feature", "('T185',)": "named thing", + "('T190', 'T191')": "disease", "('T190',)": "disease", "('T191',)": "disease", "('T192',)": "protein", "('T194',)": 
"organism taxon", + "('T195', 'T200')": "drug", "('T195',)": "drug", "('T196', 'T197')": "small molecule", + "('T196', 'T200')": "small molecule", "('T196',)": "small molecule", "('T197', 'T200')": "drug", "('T197',)": "chemical entity", From 93f08ff5fb238de2e3ea94f75d1c94b42b9d67aa Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 10:33:11 -0700 Subject: [PATCH 031/117] #316 clean up print statements for start/end --- umls_list_jsonl_to_kg_jsonl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index c544fa97..6803ac1d 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -157,6 +157,7 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed nodes_output.write(node) + def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): curie_prefix = kg2_util.CURIE_PREFIX_DRUGBANK provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) @@ -237,6 +238,7 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed if __name__ == '__main__': + print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() input_file_name = args.inputFile test_mode = args.test @@ -260,7 +262,6 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed for item in iri_mappings_raw: for prefix in item: iri_mappings[prefix] = item[prefix] - print(json.dumps(iri_mappings, indent=4, sort_keys=True)) for data in input_items: # There should only be one item in the data dictionary @@ -286,4 +287,5 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed process_fma_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) kg2_util.end_read_jsonlines(input_read_jsonlines_info) - kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline 
at end of file + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From fba2af0a7405c1982dfb68960d9d3063a514e660 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 10:40:53 -0700 Subject: [PATCH 032/117] #316 swap order of source and id to make looking through a particular source easier --- umls_list_jsonl_to_kg_jsonl.py | 2 +- umls_mysql_to_list_jsonl.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 6803ac1d..a27e71b9 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -43,7 +43,7 @@ def get_args(): def extract_node_id(node_id_str): node_id_str = node_id_str.replace('(', '').replace(')', '').replace("'", '') node_id = node_id_str.split(',') - return node_id[1].strip(), node_id[0].strip() + return node_id[0].strip(), node_id[1].strip() def make_node_id(curie_prefix, node_id): diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 88ff11a1..fc91b7ca 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -59,7 +59,7 @@ def code_sources(cursor, output): cursor.execute(names_sql_statement) for result in cursor.fetchall(): (node_id, node_source, cui, names) = result - key = (node_id, node_source) + key = (node_source, node_id) code_source_info[key] = dict() code_source_info[key][cui_key] = cui.split(',') if name_key not in code_source_info[key]: @@ -78,7 +78,7 @@ def code_sources(cursor, output): cursor.execute(extra_info_sql_statement) for result in cursor.fetchall(): (node_id, node_source, info) = result - key = (node_id, node_source) + key = (node_source, node_id) if key not in code_source_info: # This occurs if a node doesn't have a name. 
continue @@ -98,7 +98,7 @@ def code_sources(cursor, output): cursor.execute(tuis_sql_statement) for result in cursor.fetchall(): (node_id, node_source, tuis) = result - key = (node_id, node_source) + key = (node_source, node_id) if key not in code_source_info: # This occurs if a node doesn't have a name. continue @@ -134,7 +134,7 @@ def cui_sources(cursor, output, sources): cursor.execute(names_sql_statement) for result in cursor.fetchall(): (node_id, names) = result - key = (node_id, umls_source_name) + key = (umls_source_name, node_id) cui_source_info[key] = dict() cui_source_info[key][name_key] = dict() for name in names.split('\t'): @@ -157,7 +157,7 @@ def cui_sources(cursor, output, sources): cursor.execute(tuis_sql_statement) for result in cursor.fetchall(): (node_id, tuis) = result - key = (node_id, umls_source_name) + key = (umls_source_name, node_id) if key not in cui_source_info: # This happens if a node doesn't have an English name. See https://github.com/RTXteam/RTX-KG2/issues/316#issuecomment-1672074392 continue @@ -168,7 +168,7 @@ def cui_sources(cursor, output, sources): cursor.execute(relations_sql_statement) for result in cursor.fetchall(): (cui1, rel, rela, direction, cui2, source) = result - key = (cui1, umls_source_name) + key = (umls_source_name, cui1) if key not in cui_source_info: # See above for explanation continue @@ -187,7 +187,7 @@ def cui_sources(cursor, output, sources): cursor.execute(definitions_sql_statement) for result in cursor.fetchall(): (node_id, definition) = result - key = (node_id, umls_source_name) + key = (umls_source_name, node_id) if key not in cui_source_info: # See above for explanation continue From 0d61e8369fb7c5d260e59a5aaa167938edff83a9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 11:36:43 -0700 Subject: [PATCH 033/117] #316 factor out description creation --- umls_list_jsonl_to_kg_jsonl.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git 
a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a27e71b9..bfeb84f7 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -50,6 +50,14 @@ def make_node_id(curie_prefix, node_id): return curie_prefix + ':' + node_id +def create_description(tuis): + description = str() + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description = description.strip("; ") + return description + + def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): curie_prefix = kg2_util.CURIE_PREFIX_ATC provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) @@ -103,11 +111,7 @@ def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed name = name[0] node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - description = str() - for tui in tuis: - description += "; UMLS Semantic Type: STY:" + tui - description.strip("; ") - node['description'] = description + node['description'] = create_description(tuis) nodes_output.write(node) @@ -149,11 +153,7 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - description = str() - for tui in tuis: - description += "; UMLS Semantic Type: STY:" + tui - description.strip("; ") - node['description'] = description + node['description'] = create_description(tuis) nodes_output.write(node) @@ -184,11 +184,7 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu # TODO: figure out update date node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - description = str() - for tui in tuis: - description += "; UMLS Semantic Type: STY:" + tui - description.strip("; ") - node['description'] = 
description + node['description'] = create_description(tuis) nodes_output.write(node) @@ -228,11 +224,7 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - description = str() - for tui in tuis: - description += "; UMLS Semantic Type: STY:" + tui - description.strip("; ") - node['description'] = description + node['description'] = create_description(tuis) nodes_output.write(node) From b2f04f25a4f5f8bdee5a91e3f410cca1330888b7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 13:07:38 -0700 Subject: [PATCH 034/117] #316 go is working --- umls_list_jsonl_to_kg_jsonl.py | 157 +++++++++++++++------------------ 1 file changed, 73 insertions(+), 84 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index bfeb84f7..271e8855 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -50,14 +50,32 @@ def make_node_id(curie_prefix, node_id): return curie_prefix + ':' + node_id -def create_description(tuis): - description = str() +def create_description(comment, tuis): + description = comment for tui in tuis: description += "; UMLS Semantic Type: STY:" + tui description = description.strip("; ") return description +def get_name_synonyms(names_dict, accession_heirarchy): + names = list() + for key in accession_heirarchy: + names += [name for name in names_dict.get(key, dict()).get('Y', list())] + names += [name for name in names_dict.get(key, dict()).get('N', list())] + assert len(names) > 0 + if len(names) == 1: + return names[0], list() + return names[0], names[1:] + + +def get_name_keys(names_dict): + keys_list = [] + for key in names_dict: + keys_list.append(key) + return str(sorted(keys_list)) + + def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): curie_prefix = kg2_util.CURIE_PREFIX_ATC 
provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) @@ -73,45 +91,11 @@ def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed name = str() synonyms = list() names = info.get(NAMES_KEY, dict()) - if "RXN_PT" in names: - rxn_pt = names.get('RXN_PT', dict()) - if 'Y' in rxn_pt: - name = rxn_pt.get('Y', '') - assert len(name) == 1 - name = name[0] - else: - name = rxn_pt.get('N', '') - assert len(name) == 1 - name = name[0] - synonyms = [syn for syn in names.get('PT', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('PT', dict()).get('N', list())] - synonyms += [syn for syn in names.get('IN', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('IN', dict()).get('N', list())] - elif "PT" in names: - pt = names.get('PT', dict()) - if 'Y' in pt: - name = pt.get('Y', '') - assert len(name) == 1 - name = name[0] - else: - name = pt.get('N', '') - assert len(name) == 1 - name = name[0] - synonyms += [syn for syn in names.get('IN', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('IN', dict()).get('N', list())] - else: - in_dict = names.get('IN', dict()) - if 'Y' in in_dict: - name = in_dict.get('Y', '') - assert len(name) == 1 - name = name[0] - else: - name = in_dict.get('N', '') - assert len(name) == 1 - name = name[0] + name, synonyms = get_name_synonyms(names, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) + node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - node['description'] = create_description(tuis) + node['description'] = create_description("", tuis) nodes_output.write(node) @@ -135,25 +119,11 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed name = str() synonyms = list() names = info.get(NAMES_KEY, dict()) - pt = names.get('PT', dict()) - synonyms += [syn for syn in names.get('SY', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] - 
if 'Y' in pt: - name = pt.get('Y', '') - assert len(name) == 1, str(name) + ' ' + node_curie - name = name[0] - elif 'N' in pt: - name = pt.get('N', '') - assert len(name) == 1, str(name) + ' ' + node_curie - name = name[0] - else: - name = synonyms[0] - synonyms = synonyms[1:] - name = name[0] + name, synonyms = get_name_synonyms(names, ['PT', 'SY']) node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - node['description'] = create_description(tuis) + node['description'] = create_description("", tuis) nodes_output.write(node) @@ -170,21 +140,13 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) - name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('N', list()) - if len(name) == 0: - name = info.get(NAMES_KEY, dict()).get('IN', dict()).get('Y', list()) - assert len(name) == 1, str(name) + " " + node_curie - name = name[0] - synonyms = list() - for syn_cat in info.get(NAMES_KEY, dict()).get('SY', dict()): - synonyms += info.get(NAMES_KEY, dict()).get('SY', dict())[syn_cat] - for syn_cat in info.get(NAMES_KEY, dict()).get('FSY', dict()): - synonyms += info.get(NAMES_KEY, dict()).get('FSY', dict())[syn_cat] + names = info.get(NAMES_KEY, dict()) + name, synonyms = get_name_synonyms(names, ['IN', 'SY', 'FSY']) # TODO: figure out update date node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - node['description'] = create_description(tuis) + node['description'] = create_description("", tuis) nodes_output.write(node) @@ -204,31 +166,53 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed name = str() synonyms = list() names = info.get(NAMES_KEY, dict()) - pt = names.get('PT', dict()) - synonyms += [syn for syn in 
names.get('SY', dict()).get('Y', list())] - synonyms += [syn for syn in names.get('SY', dict()).get('N', list())] - if 'Y' in pt: - name = pt.get('Y', '') - if len(name) > 1: - synonyms += name[1:] - name = name[0] - elif 'N' in pt: - name = pt.get('N', '') - if len(name) > 1: - synonyms += name[1:] - name = name[0] - else: - name = synonyms[0] - synonyms = synonyms[1:] - name = name[0] + name, synonyms = get_name_synonyms(names, ['PT', 'SY']) node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) node['synonym'] = synonyms - node['description'] = create_description(tuis) + node['description'] = create_description("", tuis) nodes_output.write(node) +def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + curie_prefix = kg2_util.CURIE_PREFIX_GO + provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) + node_id = node_id.replace('GO:', '') + iri = iri_mappings[curie_prefix] + node_id + node_curie = make_node_id(curie_prefix, node_id) + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + go_namespace = info.get(INFO_KEY, dict()).get('GO_NAMESPACE', list()) + assert len(go_namespace) == 1 + go_namespace = go_namespace[0] + namespace_category_map = {'molecular_function': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, + 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, + 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} + category = namespace_category_map.get(go_namespace, tui_mappings[str(tuple(tuis))]) + go_comment = info.get(INFO_KEY, dict()).get('GO_COMMENT', str()) + + # Currently not used, but extracting them in case we want them in the future + date_created = info.get(INFO_KEY, dict()).get('DATE_CREATED', list()) + go_subset = info.get(INFO_KEY, dict()).get('GO_SUBSET', list()) + gxr = info.get(INFO_KEY, dict()).get('GXR', list()) + ref = info.get(INFO_KEY, dict()).get('REF', list()) + sid = info.get(INFO_KEY, 
dict()).get('SID', list()) + + name = str() + synonyms = list() + names = info.get(NAMES_KEY, dict()) + name, synonyms = get_name_synonyms(names, ['PT', 'MTH_PT', 'SY', 'MTH_SY', 'ET', 'MTH_ET']) + + node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) + node['synonym'] = synonyms + if len(go_comment) > 0: + go_comment = go_comment[0] + go_comment = "// COMMENTS: " + go_comment + node['description'] = create_description(go_comment, tuis) + + nodes_output.write(node) + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -245,6 +229,7 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed input_items = input_read_jsonlines_info[0] tui_mappings = dict() + name_keys = set() with open('tui_combo_mappings.json') as mappings: tui_mappings = json.load(mappings) @@ -278,6 +263,10 @@ def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed if source == 'FMA': process_fma_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + if source == 'GO': + process_go_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + # print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From b3369f2e01975e6decb47e0463d1dc2b83257b42 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 13:29:18 -0700 Subject: [PATCH 035/117] #316 lots of restructuring to streamline the code --- kg2_util.py | 2 + umls_list_jsonl_to_kg_jsonl.py | 111 ++++++++++++--------------------- 2 files changed, 42 insertions(+), 71 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index 0c04d2aa..d03950b1 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -57,6 +57,7 @@ 
CURIE_PREFIX_CHEMBL_COMPOUND = 'CHEMBL.COMPOUND' CURIE_PREFIX_CHEMBL_MECHANISM = 'CHEMBL.MECHANISM' CURIE_PREFIX_CHEMBL_TARGET = 'CHEMBL.TARGET' +CURIE_PREFIX_CHV = 'CHV' CURIE_PREFIX_CLINICALTRIALS = 'clinicaltrials' CURIE_PREFIX_DCTERMS = 'dcterms' CURIE_PREFIX_DGIDB = 'DGIdb' @@ -65,6 +66,7 @@ CURIE_PREFIX_DRUGCENTRAL = 'DrugCentral' CURIE_PREFIX_ENSEMBL = 'ENSEMBL' CURIE_PREFIX_ENSEMBL_GENOMES = 'EnsemblGenomes' +CURIE_PREFIX_FMA = 'FMA' CURIE_PREFIX_GO = 'GO' CURIE_PREFIX_GTPI = 'GTPI' CURIE_PREFIX_GTPI_SOURCE = 'GTPI_source' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 271e8855..ace05061 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -76,37 +76,41 @@ def get_name_keys(names_dict): return str(sorted(keys_list)) -def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): +def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): + node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) + node['synonym'] = synonyms + node['description'] = description + + nodes_output.write(node) + + +def get_basic_info(curie_prefix, node_id, tui_mappings, iri_mappings, info): curie_prefix = kg2_util.CURIE_PREFIX_ATC provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) iri = iri_mappings[curie_prefix] + node_id node_curie = make_node_id(curie_prefix, node_id) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) + category = tui_mappings[str(tuple(tuis))] + + return node_curie, iri, provided_by, category, cuis, tuis + + +def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): + node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_ATC, node_id, tui_mappings, iri_mappings, info) # Currently not used, but extracting them in case we want them in the future atc_level = info.get(INFO_KEY, 
dict()).get('ATC_LEVEL', list())[0] is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] - name = str() - synonyms = list() names = info.get(NAMES_KEY, dict()) name, synonyms = get_name_synonyms(names, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) - node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) - node['synonym'] = synonyms - node['description'] = create_description("", tuis) - - nodes_output.write(node) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - curie_prefix = "CHV" # This should be replaced with a kg2_util prefix at some point - provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - iri = iri_mappings[curie_prefix] + node_id - node_curie = make_node_id(curie_prefix, node_id) - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) + node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_CHV, node_id, tui_mappings, iri_mappings, info) # Currently not used, but extracting them in case we want them in the future combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) @@ -116,25 +120,14 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) - name = str() - synonyms = list() names = info.get(NAMES_KEY, dict()) name, synonyms = get_name_synonyms(names, ['PT', 'SY']) - node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) - node['synonym'] = synonyms - node['description'] = create_description("", tuis) - - nodes_output.write(node) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) 
def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - curie_prefix = kg2_util.CURIE_PREFIX_DRUGBANK - provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - iri = iri_mappings[curie_prefix] + node_id - node_curie = make_node_id(curie_prefix, node_id) - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) + node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_DRUGBANK, node_id, tui_mappings, iri_mappings, info) # Currently not used, but extracting them in case we want them in the future fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) @@ -144,74 +137,50 @@ def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_outpu name, synonyms = get_name_synonyms(names, ['IN', 'SY', 'FSY']) # TODO: figure out update date - node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) - node['synonym'] = synonyms - node['description'] = create_description("", tuis) - - nodes_output.write(node) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - curie_prefix = "FMA" # This should be replaced with a kg2_util prefix at some point - provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - iri = iri_mappings[curie_prefix] + node_id - node_curie = make_node_id(curie_prefix, node_id) - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) + node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_FMA, node_id, tui_mappings, iri_mappings, info) # Currently not used, but extracting them in case we want them in the future authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) - name = str() - 
synonyms = list() names = info.get(NAMES_KEY, dict()) name, synonyms = get_name_synonyms(names, ['PT', 'SY']) - node = kg2_util.make_node(node_curie, iri, name, tui_mappings[str(tuple(tuis))], "2023", provided_by) - node['synonym'] = synonyms - node['description'] = create_description("", tuis) - - nodes_output.write(node) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - curie_prefix = kg2_util.CURIE_PREFIX_GO - provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - node_id = node_id.replace('GO:', '') - iri = iri_mappings[curie_prefix] + node_id - node_curie = make_node_id(curie_prefix, node_id) - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) - go_namespace = info.get(INFO_KEY, dict()).get('GO_NAMESPACE', list()) + node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_GO, node_id, tui_mappings, iri_mappings, info) + + # GO-specific information + attributes = info.get(INFO_KEY, dict()) + go_namespace = attributes.get('GO_NAMESPACE', list()) assert len(go_namespace) == 1 go_namespace = go_namespace[0] namespace_category_map = {'molecular_function': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} - category = namespace_category_map.get(go_namespace, tui_mappings[str(tuple(tuis))]) - go_comment = info.get(INFO_KEY, dict()).get('GO_COMMENT', str()) + category = namespace_category_map.get(go_namespace, category) + go_comment = attributes.get('GO_COMMENT', str()) + if len(go_comment) > 0: + go_comment = go_comment[0] + go_comment = "// COMMENTS: " + go_comment # Currently not used, but extracting them in case we want them in the future - date_created = info.get(INFO_KEY, dict()).get('DATE_CREATED', list()) 
- go_subset = info.get(INFO_KEY, dict()).get('GO_SUBSET', list()) - gxr = info.get(INFO_KEY, dict()).get('GXR', list()) - ref = info.get(INFO_KEY, dict()).get('REF', list()) - sid = info.get(INFO_KEY, dict()).get('SID', list()) - - name = str() - synonyms = list() + date_created = attributes.get('DATE_CREATED', list()) + go_subset = attributes.get('GO_SUBSET', list()) + gxr = attributes.get('GXR', list()) + ref = attributes.get('REF', list()) + sid = attributes.get('SID', list()) + names = info.get(NAMES_KEY, dict()) name, synonyms = get_name_synonyms(names, ['PT', 'MTH_PT', 'SY', 'MTH_SY', 'ET', 'MTH_ET']) - node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) - node['synonym'] = synonyms - if len(go_comment) > 0: - go_comment = go_comment[0] - go_comment = "// COMMENTS: " + go_comment - node['description'] = create_description(go_comment, tuis) - - nodes_output.write(node) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description(go_comment, tuis), nodes_output) if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From 2f8e61b51923211f5a743bbdfff5c286ae7cdc78 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 14:06:16 -0700 Subject: [PATCH 036/117] #316 correcting a lot of typos, factoring more out, and HCPCS --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 97 ++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index d03950b1..d7750974 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -70,6 +70,7 @@ CURIE_PREFIX_GO = 'GO' CURIE_PREFIX_GTPI = 'GTPI' CURIE_PREFIX_GTPI_SOURCE = 'GTPI_source' +CURIE_PREFIX_HCPCS = 'HCPCS' CURIE_PREFIX_HGNC = 'HGNC' CURIE_PREFIX_HMDB = 'HMDB' CURIE_PREFIX_IAO = 'IAO' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index ace05061..22c2f676 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ 
b/umls_list_jsonl_to_kg_jsonl.py @@ -28,6 +28,16 @@ NAMES_KEY = 'names' TUIS_KEY = 'tuis' +TUI_MAPPINGS = dict() +IRI_MAPPINGS = dict() + +ATC_PREFIX = kg2_util.CURIE_PREFIX_ATC +CHV_PREFIX = kg2_util.CURIE_PREFIX_CHV +DRUGBANK_PREFIX = kg2_util.CURIE_PREFIX_DRUGBANK +FMA_PREFIX = kg2_util.CURIE_PREFIX_FMA +GO_PREFIX = kg2_util.CURIE_PREFIX_GO +HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS + UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -76,6 +86,13 @@ def get_name_keys(names_dict): return str(sorted(keys_list)) +def get_attribute_keys(attributes_dict): + keys_list = [] + for key in attributes_dict: + keys_list.append(key) + return str(sorted(keys_list)) + + def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) node['synonym'] = synonyms @@ -84,33 +101,32 @@ def make_umls_node(node_curie, iri, name, category, update_date, provided_by, sy nodes_output.write(node) -def get_basic_info(curie_prefix, node_id, tui_mappings, iri_mappings, info): - curie_prefix = kg2_util.CURIE_PREFIX_ATC +def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - iri = iri_mappings[curie_prefix] + node_id + iri = IRI_MAPPINGS[curie_prefix] + node_id node_curie = make_node_id(curie_prefix, node_id) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) - category = tui_mappings[str(tuple(tuis))] + category = TUI_MAPPINGS[str(tuple(tuis))] - return node_curie, iri, provided_by, category, cuis, tuis + names = info.get(NAMES_KEY, dict()) + name, synonyms = get_name_synonyms(names, accession_heirarchy) + + return node_curie, iri, name, provided_by, category, synonyms, cuis, tuis -def process_atc_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - node_curie, iri, provided_by, category, cuis, tuis = 
get_basic_info(kg2_util.CURIE_PREFIX_ATC, node_id, tui_mappings, iri_mappings, info) +def process_atc_item(node_id, info, nodes_output, edges_output): + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ATC_PREFIX, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) # Currently not used, but extracting them in case we want them in the future atc_level = info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_CHV, node_id, tui_mappings, iri_mappings, info) +def process_chv_item(node_id, info, nodes_output, edges_output): + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(CHV_PREFIX, node_id, info, ['PT', 'SY']) # Currently not used, but extracting them in case we want them in the future combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) @@ -120,41 +136,33 @@ def process_chv_item(node_id, info, tui_mappings, iri_mappings, nodes_output, ed disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, ['PT', 'SY']) - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_drugbank_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_DRUGBANK, node_id, 
tui_mappings, iri_mappings, info) +def process_drugbank_item(node_id, info, nodes_output, edges_output): + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(DRUGBANK_PREFIX, node_id, info, ['IN', 'SY', 'FSY']) # Currently not used, but extracting them in case we want them in the future fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, ['IN', 'SY', 'FSY']) - # TODO: figure out update date make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_fma_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_FMA, node_id, tui_mappings, iri_mappings, info) +def process_fma_item(node_id, info, nodes_output, edges_output): + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(FMA_PREFIX, node_id, info, ['PT', 'SY']) # Currently not used, but extracting them in case we want them in the future authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, ['PT', 'SY']) - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edges_output): - node_curie, iri, provided_by, category, cuis, tuis = get_basic_info(kg2_util.CURIE_PREFIX_GO, node_id, tui_mappings, iri_mappings, info) +def process_go_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'MTH_PT', 'SY', 'MTH_SY', 'ET', 'MTH_ET'] + node_curie, iri, name, provided_by, 
category, synonyms, cuis, tuis = get_basic_info(GO_PREFIX, node_id.replace('GO:', ''), info, accession_heirarchy) # GO-specific information attributes = info.get(INFO_KEY, dict()) @@ -177,11 +185,19 @@ def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edg ref = attributes.get('REF', list()) sid = attributes.get('SID', list()) - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, ['PT', 'MTH_PT', 'SY', 'MTH_SY', 'ET', 'MTH_ET']) - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description(go_comment, tuis), nodes_output) + +def process_hcpcs_item(node_id, info, nodes_output, edges_output): + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id, info, ['PT', 'MTH_HT', 'MP']) + + # Currently not used, but extracting them in case we want them in the future + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + return get_attribute_keys(info.get(INFO_KEY, dict())) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -197,17 +213,15 @@ def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edg input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_items = input_read_jsonlines_info[0] - tui_mappings = dict() name_keys = set() with open('tui_combo_mappings.json') as mappings: - tui_mappings = json.load(mappings) + TUI_MAPPINGS = json.load(mappings) - iri_mappings = dict() iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('curies-to-urls-map.yaml'))['use_for_bidirectional_mapping'] for item in iri_mappings_raw: for prefix in item: - iri_mappings[prefix] = item[prefix] + IRI_MAPPINGS[prefix] = item[prefix] for data in input_items: # There should only be one item in the data dictionary @@ -221,21 +235,24 @@ 
def process_go_item(node_id, info, tui_mappings, iri_mappings, nodes_output, edg # Process the data specifically by source if source == 'ATC': - process_atc_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + process_atc_item(node_id, value, nodes_output, edges_output) if source == 'CHV': - process_chv_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + process_chv_item(node_id, value, nodes_output, edges_output) if source == 'DRUGBANK': - process_drugbank_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + process_drugbank_item(node_id, value, nodes_output, edges_output) if source == 'FMA': - process_fma_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + process_fma_item(node_id, value, nodes_output, edges_output) if source == 'GO': - process_go_item(node_id, value, tui_mappings, iri_mappings, nodes_output, edges_output) + process_go_item(node_id, value, nodes_output, edges_output) + + if source == 'HCPCS': + name_keys.add(process_hcpcs_item(node_id, value, nodes_output, edges_output)) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) - # print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) + print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From c066784ee9bfb780e47cb221f672ef5996b0b815 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 15:15:13 -0700 Subject: [PATCH 037/117] #316 HGNC --- umls_list_jsonl_to_kg_jsonl.py | 74 +++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 22c2f676..bffdfd4e 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -90,13 +90,14 @@ def 
get_attribute_keys(attributes_dict): keys_list = [] for key in attributes_dict: keys_list.append(key) - return str(sorted(keys_list)) + return set(keys_list) -def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): +def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output, full_name=None): node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) node['synonym'] = synonyms node['description'] = description + node['full_name'] = full_name nodes_output.write(node) @@ -191,11 +192,68 @@ def process_go_item(node_id, info, nodes_output, edges_output): def process_hcpcs_item(node_id, info, nodes_output, edges_output): node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id, info, ['PT', 'MTH_HT', 'MP']) - # Currently not used, but extracting them in case we want them in the future + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) + had = attributes.get('HAD', list()) # HCPCS Action Effective Date - effective date of action to a procedure or modifier code. + hcc = attributes.get('HCC', list()) # HCPCS Coverage Code - code denoting Medicare coverage status. There are two subelements separated by "=". + hts = attributes.get('HTS', list()) # HCPCS Type of Service Code - carrier assigned HCFA Type of Service which describes the particular kind(s) of service represented by the procedure code. + hcd = attributes.get('HCD', list()) # HCPCS Code Added Date - year the HCPCS code was added to the HCFA Common Procedure Coding System. + hpn = attributes.get('HPN', list()) # HCPCS processing note number identifying the processing note contained in Appendix A of the HCPCS Manual. 
+ haq = attributes.get('HAQ', list()) # HCPCS Anesthesia Base Unit Quantity - base unit represents the level of intensity for anesthesia procedure services that reflects all activities except time. + hlc = attributes.get('HLC', list()) # HCPCS Lab Certification Code - code used to classify laboratory procedures according to the specialty certification categories listed by CMS(formerly HCFA). + hsn = attributes.get('HSN', list()) # HCPCS Statute Number identifying statute reference for coverage or noncoverage of procedure or service. + hpd = attributes.get('HPD', list()) # HCPCS ASC payment group effective date - date the procedure is assigned to the ASC payment group. + hpg = attributes.get('HPG', list()) # HCPCS ASC payment group code which represents the dollar amount of the facility charge payable by Medicare for the procedure. + hmg = attributes.get('HMR', list()) # HCPCS Medicare Carriers Manual reference section number - number identifying a section of the Medicare Carriers Manual. + hir = attributes.get('HIR', list()) # HCPCS Coverage Issues Manual Reference Section Number - number identifying the Reference Section of the Coverage Issues Manual. + hxr = attributes.get('HXR', list()) # HCPCS Cross reference code - an explicit reference crosswalking a deleted code or a code that is not valid for Medicare to a valid current code (or range of codes). + hmp = attributes.get('HMP', list()) # HCPCS Multiple Pricing Indicator Code - code used to identify instances where a procedure could be priced. + hpi = attributes.get('HPI', list()) # HCPCS Pricing Indicator Code - used to identify the appropriate methodology for developing unique pricing amounts under Part B. + hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. 
+ hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - return get_attribute_keys(info.get(INFO_KEY, dict())) + +def process_hgnc_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['ACR', 'PT', 'MTH_ACR', 'NA', 'NP', 'NS', 'SYN'] + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id.replace('HGNC:', ''), info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) + mgd_id = attributes.get('MGD_ID', list()) + vega_id = attributes.get('VEGA_ID', list()) + genecc = attributes.get('GENCC', list()) + swp = attributes.get('SWP', list()) + mane_select = attributes.get('MANE_SELECT', list()) + local_specific_db_xr = attributes.get('LOCUS_SPECIFIC_DB_XR', list()) + locus_type = attributes.get('LOCUS_TYPE', list()) + agr = attributes.get('AGR', list()) + cytogenetic_location = attributes.get('CYTOGENETIC_LOCATION', list()) + date_created = attributes.get('DATE_CREATED', list()) + ensemblgene_id = attributes.get('ENSEMBLGENE_ID', list()) + db_xr_id = attributes.get('DB_XR_ID', list()) + locus_group = attributes.get('LOCUS_GROUP', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + date_name_changed = attributes.get('DATE_NAME_CHANGED', list()) + pmid = attributes.get('PMID', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + mapped_ucsc_id = attributes.get('MAPPED_UCSC_ID', list()) + refseq_id = attributes.get('REFSEQ_ID', list()) + ena = attributes.get('ENA', list()) + rgd_id = 
attributes.get('RGD_ID', list()) + date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) + omim_id = attributes.get('OMIM_ID', list()) + gene_fam_id = attributes.get('GENE_FAM_ID', list()) + gene_symbol = attributes.get('GENESYMBOL', list()) + ez = attributes.get('EZ', list()) + ccds_id = attributes.get('CCDS_ID', list()) + lncipedia = attributes.get('LNCIPEDIA', list()) + gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output, full_name) + + return get_attribute_keys(attributes) if __name__ == '__main__': @@ -214,6 +272,7 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): input_items = input_read_jsonlines_info[0] name_keys = set() + attribute_keys = set() with open('tui_combo_mappings.json') as mappings: TUI_MAPPINGS = json.load(mappings) @@ -250,9 +309,12 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): process_go_item(node_id, value, nodes_output, edges_output) if source == 'HCPCS': - name_keys.add(process_hcpcs_item(node_id, value, nodes_output, edges_output)) + process_hcpcs_item(node_id, value, nodes_output, edges_output) + + if source == 'HGNC': + attribute_keys.update(process_hgnc_item(node_id, value, nodes_output, edges_output)) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) - print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) + print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From 15984e9bf5dbef64eb15e8bc2fe68dfbd2c9ef07 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 15:15:57 -0700 Subject: [PATCH 038/117] #316 don't actually need full name for HGNC, that's just because it merges with pr.owl --- umls_list_jsonl_to_kg_jsonl.py | 5 ++--- 1 file 
changed, 2 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index bffdfd4e..a1eb1b88 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -93,11 +93,10 @@ def get_attribute_keys(attributes_dict): return set(keys_list) -def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output, full_name=None): +def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) node['synonym'] = synonyms node['description'] = description - node['full_name'] = full_name nodes_output.write(node) @@ -251,7 +250,7 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): lncipedia = attributes.get('LNCIPEDIA', list()) gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output, full_name) + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) return get_attribute_keys(attributes) From 5b5bbaea94b307ada37d1b354dd4a7ce5e503b6d Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 15:19:40 -0700 Subject: [PATCH 039/117] #316 cleaning up (we don't actually want the return statement) --- umls_list_jsonl_to_kg_jsonl.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a1eb1b88..c89c0aeb 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -252,8 +252,6 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - return get_attribute_keys(attributes) - if __name__ == 
'__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -311,9 +309,9 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): process_hcpcs_item(node_id, value, nodes_output, edges_output) if source == 'HGNC': - attribute_keys.update(process_hgnc_item(node_id, value, nodes_output, edges_output)) + process_hgnc_item(node_id, value, nodes_output, edges_output) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) - print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) + # print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From 0d4d89b6e70c9ab97f12a3d42080232bd4aebd09 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 15:22:32 -0700 Subject: [PATCH 040/117] #316 correct prefix for HGNC --- umls_list_jsonl_to_kg_jsonl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index c89c0aeb..51c1cab9 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -37,6 +37,7 @@ FMA_PREFIX = kg2_util.CURIE_PREFIX_FMA GO_PREFIX = kg2_util.CURIE_PREFIX_GO HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS +HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -216,7 +217,7 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): def process_hgnc_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['ACR', 'PT', 'MTH_ACR', 'NA', 'NP', 'NS', 'SYN'] - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id.replace('HGNC:', ''), info, accession_heirarchy) + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HGNC_PREFIX, node_id.replace('HGNC:', ''), info, 
accession_heirarchy) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(INFO_KEY, dict()) From fd5495b05a0c7bf4d99a27e7d04641dc6b239240 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 18 Aug 2023 15:26:19 -0700 Subject: [PATCH 041/117] #316 correct the order of name priority for HGNC --- umls_list_jsonl_to_kg_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 51c1cab9..987f607b 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -216,7 +216,7 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): def process_hgnc_item(node_id, info, nodes_output, edges_output): - accession_heirarchy = ['ACR', 'PT', 'MTH_ACR', 'NA', 'NP', 'NS', 'SYN'] + accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'NP', 'NS', 'SYN'] node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HGNC_PREFIX, node_id.replace('HGNC:', ''), info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html From bc698ee5fcd62c3876283a372a5f2e18ed54ae24 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 10:41:14 -0700 Subject: [PATCH 042/117] #316 HL7, updated some of the accession hierarchies based on webpage --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 60 +++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index d7750974..ba99bc70 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -72,6 +72,7 @@ CURIE_PREFIX_GTPI_SOURCE = 'GTPI_source' CURIE_PREFIX_HCPCS = 'HCPCS' CURIE_PREFIX_HGNC = 'HGNC' +CURIE_PREFIX_HL7 = 'HL7' CURIE_PREFIX_HMDB 
= 'HMDB' CURIE_PREFIX_IAO = 'IAO' CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 987f607b..a6b51336 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -20,7 +20,7 @@ DESIRED_CODES = ['ATC', 'CHV', 'DRUGBANK', 'FMA', 'GO', 'HCPCS', 'HGNC', 'HL7V3.0', - 'HL7', 'HPO', 'ICD10PCS', 'ICD9CM', 'MED-RT', 'MEDLINEPLUS', 'MSH', + 'HPO', 'ICD10PCS', 'ICD9CM', 'MED-RT', 'MEDLINEPLUS', 'MSH', 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', 'PSY', 'RXNORM', 'VANDF'] CUIS_KEY = 'cuis' @@ -38,6 +38,7 @@ GO_PREFIX = kg2_util.CURIE_PREFIX_GO HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC +HL7_PREFIX = kg2_util.CURIE_PREFIX_HL7 UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -162,7 +163,7 @@ def process_fma_item(node_id, info, nodes_output, edges_output): def process_go_item(node_id, info, nodes_output, edges_output): - accession_heirarchy = ['PT', 'MTH_PT', 'SY', 'MTH_SY', 'ET', 'MTH_ET'] + accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY'] node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(GO_PREFIX, node_id.replace('GO:', ''), info, accession_heirarchy) # GO-specific information @@ -216,7 +217,7 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): def process_hgnc_item(node_id, info, nodes_output, edges_output): - accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'NP', 'NS', 'SYN'] + accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'SYN', 'NP', 'NS'] node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HGNC_PREFIX, node_id.replace('HGNC:', ''), info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html @@ -254,6 
+255,49 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_hl7_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HL7_PREFIX, node_id, info, accession_heirarchy) + + hl7at = attributes.get('HL7AT', list()) + hl7ii = attributes.get('HL7II', list()) + hl7im = attributes.get('HL7IM', list()) + hl7lt = attributes.get('HL7LT', list()) + hl7un = attributes.get('HL7UN', list()) + hl7oa = attributes.get('HL7OA', list()) + hl7scs = attributes.get('HL7SCS', list()) + hl7cc = attributes.get('HL7CC', list()) + hl7na = attributes.get('HL7NA', list()) + hl7in = attributes.get('HL7IN', list()) + hl7ap = attributes.get('HL7AP', list()) + hl7mi = attributes.get('HL7MI', list()) + hl7hi = attributes.get('HL7HI', list()) + hl7ir = attributes.get('HL7IR', list()) + hl7ai = attributes.get('HL7AI', list()) + hl7ha = attributes.get('HL7HA', list()) + hl7rf = attributes.get('HL7RF', list()) + hl7rd = attributes.get('HL7RD', list()) + hl7vd = attributes.get('HL7VD', list()) + hl7dc = attributes.get('HL7DC', list()) + hl7rk = attributes.get('HL7RK', list()) + hl7is = attributes.get('HL7IS', list()) + hl7sy = attributes.get('HL7SY', list()) + hl7cd = attributes.get('HL7CD', list()) + hl7sl = attributes.get('HL7SL', list()) + hl7pl = attributes.get('HL7PL', list()) + hl7vc = attributes.get('HL7VC', list()) + hl7ty = attributes.get('HL7TY', list()) + hl7rg = attributes.get('HL7RG', list()) + hl7csc = attributes.get('HL7CSC', list()) + hl7od = attributes.get('HL7OD', list()) + hl7id = attributes.get('HL7ID', list()) + hl7tr = attributes.get('HL7TR', list()) + hl7di = 
attributes.get('HL7DI', list()) + hl7cs = attributes.get('HL7CS', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -312,7 +356,15 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): if source == 'HGNC': process_hgnc_item(node_id, value, nodes_output, edges_output) + if source == 'HL7V3.0': + process_hl7_item(node_id, value, nodes_output, edges_output) + + if source == 'HPO': + name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) + attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) + kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) - # print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) + print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) + print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) From 86a1e160e0b684d70b71ab6ac7314e079b087c05 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 10:44:28 -0700 Subject: [PATCH 043/117] #316 update more accession hierarchies based on webpage --- umls_list_jsonl_to_kg_jsonl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a6b51336..2dc72e4e 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -153,7 +153,7 @@ def process_drugbank_item(node_id, info, nodes_output, edges_output): def process_fma_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(FMA_PREFIX, node_id, info, ['PT', 'SY']) + node_curie, iri, name, provided_by, category, 
synonyms, cuis, tuis = get_basic_info(FMA_PREFIX, node_id, info, ['PT', 'SY', 'AB', 'OP', 'IS']) # Currently not used, but extracting them in case we want them in the future authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) @@ -163,7 +163,7 @@ def process_fma_item(node_id, info, nodes_output, edges_output): def process_go_item(node_id, info, nodes_output, edges_output): - accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY'] + accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY', 'OP', 'MTH_OP', 'OET', 'MTH_OET', 'IS', 'MTH_IS'] node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(GO_PREFIX, node_id.replace('GO:', ''), info, accession_heirarchy) # GO-specific information @@ -191,7 +191,7 @@ def process_go_item(node_id, info, nodes_output, edges_output): def process_hcpcs_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id, info, ['PT', 'MTH_HT', 'MP']) + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id, info, ['PT', 'MP', 'MTH_HT']) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(INFO_KEY, dict()) From 2cfa1485a09b731e873e2990dc6112ac10bf9aad Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 11:33:34 -0700 Subject: [PATCH 044/117] #316 attempted work around for HL7 (a CUI source) --- kg2_util.py | 1 - umls_list_jsonl_to_kg_jsonl.py | 14 +++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index ba99bc70..d7750974 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -72,7 +72,6 @@ CURIE_PREFIX_GTPI_SOURCE = 'GTPI_source' CURIE_PREFIX_HCPCS = 'HCPCS' CURIE_PREFIX_HGNC = 'HGNC' -CURIE_PREFIX_HL7 = 'HL7' 
CURIE_PREFIX_HMDB = 'HMDB' CURIE_PREFIX_IAO = 'IAO' CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 2dc72e4e..6a8a508d 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -38,7 +38,7 @@ GO_PREFIX = kg2_util.CURIE_PREFIX_GO HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC -HL7_PREFIX = kg2_util.CURIE_PREFIX_HL7 +HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -105,10 +105,14 @@ def make_umls_node(node_curie, iri, name, category, update_date, provided_by, sy def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) - iri = IRI_MAPPINGS[curie_prefix] + node_id - node_curie = make_node_id(curie_prefix, node_id) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) + iri = IRI_MAPPINGS[curie_prefix] + node_id + if curie_prefix == kg2_util.UMLS_SOURCE_PREFIX: + if len(cuis) != 1: + return None, None, None, None, None, None, None, None + node_id = cuis[0] + node_curie = make_node_id(curie_prefix, node_id) category = TUI_MAPPINGS[str(tuple(tuis))] names = info.get(NAMES_KEY, dict()) @@ -258,7 +262,11 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): def process_hl7_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HL7_PREFIX, node_id, info, accession_heirarchy) + if node_curie == None: + return + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'HL7') + # Currently not used, but extracting them in case we want them in the future - descriptions from 
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html hl7at = attributes.get('HL7AT', list()) hl7ii = attributes.get('HL7II', list()) hl7im = attributes.get('HL7IM', list()) From ae432ac481bef8dafd3cce7d4dc8ac9302408065 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 12:06:21 -0700 Subject: [PATCH 045/117] #316 improving HL7 to actually work and adding in HPO --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index d7750974..e5f3d50d 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -73,6 +73,7 @@ CURIE_PREFIX_HCPCS = 'HCPCS' CURIE_PREFIX_HGNC = 'HGNC' CURIE_PREFIX_HMDB = 'HMDB' +CURIE_PREFIX_HP = 'HP' CURIE_PREFIX_IAO = 'IAO' CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry' CURIE_PREFIX_ISBN = 'ISBN' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 6a8a508d..b05eb1b5 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -39,6 +39,7 @@ HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS +HPO_PREFIX = kg2_util.CURIE_PREFIX_HP UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -107,12 +108,12 @@ def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) - iri = IRI_MAPPINGS[curie_prefix] + node_id - if curie_prefix == kg2_util.UMLS_SOURCE_PREFIX: + if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: if len(cuis) != 1: return None, None, None, None, None, None, None, None node_id = cuis[0] node_curie = make_node_id(curie_prefix, node_id) + iri = IRI_MAPPINGS[curie_prefix] + node_id category = TUI_MAPPINGS[str(tuple(tuis))] names = info.get(NAMES_KEY, dict()) @@ -267,6 +268,7 @@ def 
process_hl7_item(node_id, info, nodes_output, edges_output): provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'HL7') # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) hl7at = attributes.get('HL7AT', list()) hl7ii = attributes.get('HL7II', list()) hl7im = attributes.get('HL7IM', list()) @@ -306,6 +308,21 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_hpo_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HPO_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + sid = attributes.get('SID', list()) + hpo_comment = attributes.get('HPO_COMMENT', list()) + date_created = attributes.get('DATE_CREATED', list()) + syn_qualifier = attributes.get('SYN_QUALIFIER', list()) + ref = attributes.get('REF', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -368,6 +385,9 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): process_hl7_item(node_id, value, nodes_output, edges_output) if source == 'HPO': + process_hpo_item(node_id, value, nodes_output, edges_output) + + if source == 'ICD10PCS': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) 
attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 18c7366c4fee3bc0d35ab746b54ecbcfc3f5787f Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 12:35:22 -0700 Subject: [PATCH 046/117] #316 ICD10 and ICD9 work --- kg2_util.py | 2 ++ umls_list_jsonl_to_kg_jsonl.py | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/kg2_util.py b/kg2_util.py index e5f3d50d..eb1b864f 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -75,6 +75,8 @@ CURIE_PREFIX_HMDB = 'HMDB' CURIE_PREFIX_HP = 'HP' CURIE_PREFIX_IAO = 'IAO' +CURIE_PREFIX_ICD10PCS = 'ICD10PCS' +CURIE_PREFIX_ICD9 = 'ICD9' CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry' CURIE_PREFIX_ISBN = 'ISBN' CURIE_PREFIX_KEGG = 'KEGG' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index b05eb1b5..8df24faa 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -40,6 +40,8 @@ HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS HPO_PREFIX = kg2_util.CURIE_PREFIX_HP +ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS +ICD9CM = kg2_util.CURIE_PREFIX_ICD9 UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -310,7 +312,7 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): def process_hpo_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HPO_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HPO_PREFIX, node_id.replace('HP:', ''), info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -323,6 +325,18 @@ def process_hpo_item(node_id, info, 
nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_icd10_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ICD10PCS_PREFIX, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + added_meaning = attributes.get('ADDED_MEANING', list()) + order_no = attributes.get('ORDER_NO', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() From 84a4a3b2223a44a44b1b68e9e867624463f30235 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 12:55:40 -0700 Subject: [PATCH 047/117] #316 ICD9 work, surveying MED-RT --- umls_list_jsonl_to_kg_jsonl.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 8df24faa..d8ad0d43 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -41,7 +41,7 @@ HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS HPO_PREFIX = kg2_util.CURIE_PREFIX_HP ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS -ICD9CM = kg2_util.CURIE_PREFIX_ICD9 +ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -325,10 +325,9 @@ def process_hpo_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, 
synonyms, create_description("", tuis), nodes_output) -def process_icd10_item(node_id, info, nodes_output, edges_output): +def process_icd10pcs_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ICD10PCS_PREFIX, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS') # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -337,6 +336,24 @@ def process_icd10_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + +def process_icd9cm_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'HT', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ICD9CM_PREFIX, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + icc = attributes.get('ICC', list()) + ice = attributes.get('ICE', list()) + icf = attributes.get('ICF', list()) + sos = attributes.get('SOS', list()) + icn = attributes.get('ICN', list()) + ica = attributes.get('ICA', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -402,6 +419,12 @@ def process_icd10_item(node_id, info, nodes_output, 
edges_output): process_hpo_item(node_id, value, nodes_output, edges_output) if source == 'ICD10PCS': + process_icd10pcs_item(node_id, value, nodes_output, edges_output) + + if source == 'ICD9CM': + process_icd9cm_item(node_id, value, nodes_output, edges_output) + + if source == 'MED-RT': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 24048572615851aebe79cefb7e7eeb2862dcc8e4 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 13:05:17 -0700 Subject: [PATCH 048/117] #316 MED-RT --- umls_list_jsonl_to_kg_jsonl.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index d8ad0d43..4ca099ba 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -42,6 +42,7 @@ HPO_PREFIX = kg2_util.CURIE_PREFIX_HP ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 +MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -353,6 +354,19 @@ def process_icd9cm_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_medrt_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'FN', 'SY'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MEDRT_PREFIX, node_id, info, accession_heirarchy) + if node_curie == None: + return + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + term_status = attributes.get('TERM_STATUS', list()) + concept_type = attributes.get('CONCEPT_TYPE', list()) + + 
make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -425,6 +439,9 @@ def process_icd9cm_item(node_id, info, nodes_output, edges_output): process_icd9cm_item(node_id, value, nodes_output, edges_output) if source == 'MED-RT': + process_medrt_item(node_id, value, nodes_output, edges_output) + + if source == 'MEDLINEPLUS' name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 512f93d29ed411e672d7c8176d6e0c42ac6a640b Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 19 Aug 2023 13:17:06 -0700 Subject: [PATCH 049/117] #316 addressing small typo --- umls_list_jsonl_to_kg_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 4ca099ba..a127bcf0 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -441,7 +441,7 @@ def process_medrt_item(node_id, info, nodes_output, edges_output): if source == 'MED-RT': process_medrt_item(node_id, value, nodes_output, edges_output) - if source == 'MEDLINEPLUS' + if source == 'MEDLINEPLUS': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 8d55c50a5c6b70483fae00531c69d584aa49a455 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 09:53:10 -0700 Subject: [PATCH 050/117] #316 MEDLINEPLUS and MSH --- umls_list_jsonl_to_kg_jsonl.py | 67 ++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a127bcf0..1958b85f 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -43,6 +43,8 @@ ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 
MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS +MEDLINEPLUS_PREFIX = kg2_util.CURIE_PREFIX_UMLS +MSH_PREFIX = kg2_util.CURIE_PREFIX_UMLS UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -368,6 +370,65 @@ def process_medrt_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + +def process_medlineplus_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'ET', 'SY', 'HT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MEDLINEPLUS_PREFIX, node_id, info, accession_heirarchy) + if node_curie == None: + return + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + sos = attributes.get('SOS', list()) + date_created = attributes.get('DATE_CREATED', list()) + mp_group_url = attributes.get('MP_GROUP_URL', list()) + mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) + mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_msh_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MSH_PREFIX, node_id, info, accession_heirarchy) + if node_curie == None: + return + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') + + # Currently not 
used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + mmr = attributes.get('MMR', list()) + fx = attributes.get('FX', list()) + lt = attributes.get('LT', list()) + dc = attributes.get('DC', list()) + pa = attributes.get('PA', list()) + rr = attributes.get('RR', list()) + hm = attributes.get('HM', list()) + pi = attributes.get('PI', list()) + ec = attributes.get('EC', list()) + hn = attributes.get('HN', list()) + termui = attributes.get('TERMUI', list()) + th = attributes.get('TH', list()) + sos = attributes.get('SOS', list()) + ii = attributes.get('II', list()) + rn = attributes.get('RN', list()) + an = attributes.get('AN', list()) + cx = attributes.get('CX', list()) + dq = attributes.get('DQ', list()) + dx = attributes.get('DX', list()) + pm = attributes.get('PM', list()) + aql = attributes.get('AQL', list()) + sc = attributes.get('SC', list()) + fr = attributes.get('FR', list()) + mda = attributes.get('MDA', list()) + src = attributes.get('SRC', list()) + ol = attributes.get('OL', list()) + mn = attributes.get('MN', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -442,6 +503,12 @@ def process_medrt_item(node_id, info, nodes_output, edges_output): process_medrt_item(node_id, value, nodes_output, edges_output) if source == 'MEDLINEPLUS': + process_medlineplus_item(node_id, value, nodes_output, edges_output) + + if source == 'MSH': + process_msh_item(node_id, value, nodes_output, edges_output) + + if source == 'MTH': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From efcc7ef1ea1d43cd5a807cd0a83fb09723f87285 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 10:14:21 -0700 Subject: [PATCH 051/117] #316 dictionary mapping 
for functions --- umls_list_jsonl_to_kg_jsonl.py | 85 ++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 1958b85f..b222d5bf 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -19,10 +19,7 @@ import json -DESIRED_CODES = ['ATC', 'CHV', 'DRUGBANK', 'FMA', 'GO', 'HCPCS', 'HGNC', 'HL7V3.0', - 'HPO', 'ICD10PCS', 'ICD9CM', 'MED-RT', 'MEDLINEPLUS', 'MSH', - 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', - 'PSY', 'RXNORM', 'VANDF'] + CUIS_KEY = 'cuis' INFO_KEY = 'attributes' NAMES_KEY = 'names' @@ -429,6 +426,21 @@ def process_msh_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +DESIRED_CODES = {'ATC': process_atc_item, + 'CHV': process_chv_item, + 'DRUGBANK': process_drugbank_item, + 'FMA': process_fma_item, + 'GO': process_go_item, + 'HCPCS': process_hcpcs_item, + 'HGNC': process_hgnc_item, + 'HL7V3.0': process_hl7_item, + 'HPO': process_hpo_item, + 'ICD10PCS': process_icd10pcs_item, + 'ICD9CM': process_icd9cm_item} + # , 'MED-RT', 'MEDLINEPLUS', 'MSH', + # 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', + # 'PSY', 'RXNORM', 'VANDF'} + if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() @@ -462,55 +474,56 @@ def process_msh_item(node_id, info, nodes_output, edges_output): continue value = data[entity] source, node_id = extract_node_id(entity) - if source not in DESIRED_CODES and source != 'UMLS': + if source not in DESIRED_CODES: continue # Process the data specifically by source - if source == 'ATC': - process_atc_item(node_id, value, nodes_output, edges_output) + DESIRED_CODES[source](node_id, value, nodes_output, edges_output) + # if source == 'ATC': + # process_atc_item(node_id, value, nodes_output, 
edges_output) - if source == 'CHV': - process_chv_item(node_id, value, nodes_output, edges_output) + # if source == 'CHV': + # process_chv_item(node_id, value, nodes_output, edges_output) - if source == 'DRUGBANK': - process_drugbank_item(node_id, value, nodes_output, edges_output) + # if source == 'DRUGBANK': + # process_drugbank_item(node_id, value, nodes_output, edges_output) - if source == 'FMA': - process_fma_item(node_id, value, nodes_output, edges_output) + # if source == 'FMA': + # process_fma_item(node_id, value, nodes_output, edges_output) - if source == 'GO': - process_go_item(node_id, value, nodes_output, edges_output) + # if source == 'GO': + # process_go_item(node_id, value, nodes_output, edges_output) - if source == 'HCPCS': - process_hcpcs_item(node_id, value, nodes_output, edges_output) + # if source == 'HCPCS': + # process_hcpcs_item(node_id, value, nodes_output, edges_output) - if source == 'HGNC': - process_hgnc_item(node_id, value, nodes_output, edges_output) + # if source == 'HGNC': + # process_hgnc_item(node_id, value, nodes_output, edges_output) - if source == 'HL7V3.0': - process_hl7_item(node_id, value, nodes_output, edges_output) + # if source == 'HL7V3.0': + # process_hl7_item(node_id, value, nodes_output, edges_output) - if source == 'HPO': - process_hpo_item(node_id, value, nodes_output, edges_output) + # if source == 'HPO': + # process_hpo_item(node_id, value, nodes_output, edges_output) - if source == 'ICD10PCS': - process_icd10pcs_item(node_id, value, nodes_output, edges_output) + # if source == 'ICD10PCS': + # process_icd10pcs_item(node_id, value, nodes_output, edges_output) - if source == 'ICD9CM': - process_icd9cm_item(node_id, value, nodes_output, edges_output) + # if source == 'ICD9CM': + # process_icd9cm_item(node_id, value, nodes_output, edges_output) - if source == 'MED-RT': - process_medrt_item(node_id, value, nodes_output, edges_output) + # if source == 'MED-RT': + # process_medrt_item(node_id, value, nodes_output, 
edges_output) - if source == 'MEDLINEPLUS': - process_medlineplus_item(node_id, value, nodes_output, edges_output) + # if source == 'MEDLINEPLUS': + # process_medlineplus_item(node_id, value, nodes_output, edges_output) - if source == 'MSH': - process_msh_item(node_id, value, nodes_output, edges_output) + # if source == 'MSH': + # process_msh_item(node_id, value, nodes_output, edges_output) - if source == 'MTH': - name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) - attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) + # if source == 'MTH': + # name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) + # attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) From c5c2195d9ffbe0f75ab50875e1928fe009654bd3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 14:00:45 -0700 Subject: [PATCH 052/117] #316 MTH --- umls_list_jsonl_to_kg_jsonl.py | 115 ++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index b222d5bf..08a6be34 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -41,7 +41,9 @@ ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS MEDLINEPLUS_PREFIX = kg2_util.CURIE_PREFIX_UMLS -MSH_PREFIX = kg2_util.CURIE_PREFIX_UMLS +MSH_PREFIX = kg2_util.CURIE_PREFIX_MESH +MTH_PREFIX = kg2_util.CURIE_PREFIX_UMLS +NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -389,8 +391,6 @@ def process_medlineplus_item(node_id, info, nodes_output, edges_output): def process_msh_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 
'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MSH_PREFIX, node_id, info, accession_heirarchy) - if node_curie == None: - return provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') # Currently not used, but extracting them in case we want them in the future @@ -426,6 +426,48 @@ def process_msh_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_mth_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PN', 'CV', 'XM', 'PT', 'SY', 'RT', 'DT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MTH_PREFIX, node_id, info, accession_heirarchy) + if node_curie == None: + return + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MTH') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + mth_mapsetcomplexity = attributes.get('MTH_MAPSETCOMPLEXITY', list()) + fromvsab = attributes.get('FROMVSAB', list()) + mapsetrsab = attributes.get('MAPSETRSAB', list()) + mapsetversion = attributes.get('MAPSETVERSION', list()) + mapsetvsab = attributes.get('MAPSETVSAB', list()) + tovsab = attributes.get('TOVSAB', list()) + mth_mapfromexhaustive = attributes.get('MTH_MAPFROMEXHAUSTIVE', list()) + torsab = attributes.get('TORSAB', list()) + mapsetsid = attributes.get('MAPSETSID', list()) + mapsetgrammar = attributes.get('MAPSETGRAMMAR', list()) + mapsettype = attributes.get('MAPSETTYPE', list()) + mth_maptoexhaustive = attributes.get('MTH_MAPTOEXHAUSTIVE', list()) + fromrsab = attributes.get('FROMRSAB', list()) + mth_mapfromcomplexity = 
attributes.get('MTH_MAPFROMCOMPLEXITY', list()) + lt = attributes.get('LT', list()) + mth_maptocomplexity = attributes.get('MTH_MAPTOCOMPLEXITY', list()) + sos = attributes.get('SOS', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_ncbi_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCBI_PREFIX, node_id, info, accession_heirarchy) + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + div = attributes.get('DIV', list()) + authority_name = attributes.get('AUTHORITY_NAME', list()) + rank = attributes.get('RANK', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -436,10 +478,20 @@ def process_msh_item(node_id, info, nodes_output, edges_output): 'HL7V3.0': process_hl7_item, 'HPO': process_hpo_item, 'ICD10PCS': process_icd10pcs_item, - 'ICD9CM': process_icd9cm_item} - # , 'MED-RT', 'MEDLINEPLUS', 'MSH', - # 'MTH', 'NCBI', 'NCBITAXON', 'NCI', 'NDDF', 'NDFRT', 'OMIM', 'PDQ', - # 'PSY', 'RXNORM', 'VANDF'} + 'ICD9CM': process_icd9cm_item, + 'MED-RT': process_medrt_item, + 'MEDLINEPLUS': process_medlineplus_item, + 'MSH': process_msh_item, + 'MTH': process_mth_item, + 'NCBI': process_ncbi_item} + # 'NCI': process_nci_item, + # 'NDDF': process_nddf_item, + # 'NDFRT': process_ndfrt_item, + # 'OMIM': process_omim_item, + # 'PDQ': process_pdq_item, + # 'PSY': process_psy_item, + # 'RXNORM': process_rxnorm_item, + # 'VANDF': process_vandf_item} if 
__name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -474,56 +526,17 @@ def process_msh_item(node_id, info, nodes_output, edges_output): continue value = data[entity] source, node_id = extract_node_id(entity) + + if source == 'NCI': + name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) + attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) + if source not in DESIRED_CODES: continue # Process the data specifically by source DESIRED_CODES[source](node_id, value, nodes_output, edges_output) - # if source == 'ATC': - # process_atc_item(node_id, value, nodes_output, edges_output) - - # if source == 'CHV': - # process_chv_item(node_id, value, nodes_output, edges_output) - - # if source == 'DRUGBANK': - # process_drugbank_item(node_id, value, nodes_output, edges_output) - - # if source == 'FMA': - # process_fma_item(node_id, value, nodes_output, edges_output) - - # if source == 'GO': - # process_go_item(node_id, value, nodes_output, edges_output) - - # if source == 'HCPCS': - # process_hcpcs_item(node_id, value, nodes_output, edges_output) - - # if source == 'HGNC': - # process_hgnc_item(node_id, value, nodes_output, edges_output) - - # if source == 'HL7V3.0': - # process_hl7_item(node_id, value, nodes_output, edges_output) - - # if source == 'HPO': - # process_hpo_item(node_id, value, nodes_output, edges_output) - - # if source == 'ICD10PCS': - # process_icd10pcs_item(node_id, value, nodes_output, edges_output) - - # if source == 'ICD9CM': - # process_icd9cm_item(node_id, value, nodes_output, edges_output) - - # if source == 'MED-RT': - # process_medrt_item(node_id, value, nodes_output, edges_output) - - # if source == 'MEDLINEPLUS': - # process_medlineplus_item(node_id, value, nodes_output, edges_output) - - # if source == 'MSH': - # process_msh_item(node_id, value, nodes_output, edges_output) - # if source == 'MTH': - # name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) - # 
attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) From e1e205afa374322e96732cf8a0c95e311c83b945 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 14:45:19 -0700 Subject: [PATCH 053/117] #316 NCI --- umls_list_jsonl_to_kg_jsonl.py | 60 ++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 08a6be34..a20a2e63 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -468,6 +468,60 @@ def process_ncbi_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_nci_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCBI_PREFIX, node_id, info, accession_heirarchy) + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) + micronutrient = attributes.get('MICRONUTRIENT', list()) + genbank_accession_number = attributes.get('GENBANK_ACCESSION_NUMBER', list()) + fda_table = attributes.get('FDA_TABLE', list()) + usda_id = attributes.get('USDA_ID', list()) + icd_o_3_code = attributes.get('ICD-O-3_CODE', list()) + tolerable_level = attributes.get('TOLERABLE_LEVEL', list()) + ncbi_taxon_id = attributes.get('NCBI_TAXON_ID', list()) + mgi_accession_id = attributes.get('MGI_ACCESSION_ID', list()) + 
homologous_gene = attributes.get('HOMOLOGOUS_GENE', list()) + pid_id = attributes.get('PID_ID', list()) + swiss_prot = attributes.get('SWISS_PROT', list()) + essential_amino_acid = attributes.get('ESSENTIAL_AMINO_ACID', list()) + publish_value_set = attributes.get('PUBLISH_VALUE_SET', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + value_set_pair = attributes.get('VALUE_SET_PAIR', list()) + accepted_therapeutic_use_for = attributes.get('ACCEPTED_THERAPEUTIC_USE_FOR', list()) + hgnc_id = attributes.get('HGNC_ID', list()) + nci_drug_dictionary_id = attributes.get('NCI_DRUG_DICTIONARY_ID', list()) + chebi_id = attributes.get('CHEBI_ID', list()) + cnu = attributes.get('CNU', list()) + mirbase_id = attributes.get('MIRBASE_ID', list()) + macronutrient = attributes.get('MACRONUTRIENT', list()) + essential_fatty_acid = attributes.get('ESSENTIAL_FATTY_ACID', list()) + unit = attributes.get('UNIT', list()) + pdq_open_trial_search_id = attributes.get('PDQ_OPEN_TRIAL_SEARCH_ID', list()) + term_browser_value_set_description = attributes.get('TERM_BROWSER_VALUE_SET_DESCRIPTION', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + infoods = attributes.get('INFOODS', list()) + pubmedid_primary_reference = attributes.get('PUBMEDID_PRIMARY_REFERENCE', list()) + biocarta_id = attributes.get('BIOCARTA_ID', list()) + extensible_list = attributes.get('EXTENSIBLE_LIST', list()) + use_for = attributes.get('USE_FOR', list()) + neoplastic_status = attributes.get('NEOPLASTIC_STATUS', list()) + nsc_number = attributes.get('NSC_NUMBER', list()) + omim_number = attributes.get('OMIM_NUMBER', list()) + lt = attributes.get('LT', list()) + kegg_id = attributes.get('KEGG_ID', list()) + gene_encodes_product = attributes.get('GENE_ENCODES_PRODUCT', list()) + pdq_closed_trial_search_id = attributes.get('PDQ_CLOSED_TRIAL_SEARCH_ID', list()) + design_note = attributes.get('DESIGN_NOTE', list()) + nutrient = attributes.get('NUTRIENT', list()) + fda_unii_code = 
attributes.get('FDA_UNII_CODE', list()) + us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) + chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -483,8 +537,8 @@ def process_ncbi_item(node_id, info, nodes_output, edges_output): 'MEDLINEPLUS': process_medlineplus_item, 'MSH': process_msh_item, 'MTH': process_mth_item, - 'NCBI': process_ncbi_item} - # 'NCI': process_nci_item, + 'NCBI': process_ncbi_item, + 'NCI': process_nci_item} # 'NDDF': process_nddf_item, # 'NDFRT': process_ndfrt_item, # 'OMIM': process_omim_item, @@ -527,7 +581,7 @@ def process_ncbi_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'NCI': + if source == 'NDDF': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From f0564510b0924cf6639d054b95756ae615a4b490 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 14:54:52 -0700 Subject: [PATCH 054/117] #316 NDDF and NCI updates --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index eb1b864f..63de5879 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -93,6 +93,7 @@ CURIE_PREFIX_NCBI_GENE = 'NCBIGene' CURIE_PREFIX_NCBI_TAXON = 'NCBITaxon' CURIE_PREFIX_NCIT = 'NCIT' +CURIE_PREFIX_NDDF = 'NDDF' CURIE_PREFIX_OBO = 'OBO' CURIE_PREFIX_OBO_FORMAT = 'oboFormat' CURIE_PREFIX_OIO = 'OIO' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a20a2e63..c1a23a1d 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -44,6 +44,8 @@ MSH_PREFIX = 
kg2_util.CURIE_PREFIX_MESH MTH_PREFIX = kg2_util.CURIE_PREFIX_UMLS NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON +NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT +NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -459,6 +461,7 @@ def process_mth_item(node_id, info, nodes_output, edges_output): def process_ncbi_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCBI_PREFIX, node_id, info, accession_heirarchy) + # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) div = attributes.get('DIV', list()) @@ -470,7 +473,9 @@ def process_ncbi_item(node_id, info, nodes_output, edges_output): def process_nci_item(node_id, info, nodes_output, edges_output): accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCBI_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCI_PREFIX, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'NCI') + # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) @@ -521,6 +526,16 @@ def process_nci_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def 
process_nddf_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['MTH_RXN_CDC', 'CDC', 'CDD', 'CDA', 'IN', 'DF'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NDDF_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, @@ -538,8 +553,8 @@ def process_nci_item(node_id, info, nodes_output, edges_output): 'MSH': process_msh_item, 'MTH': process_mth_item, 'NCBI': process_ncbi_item, - 'NCI': process_nci_item} - # 'NDDF': process_nddf_item, + 'NCI': process_nci_item, + 'NDDF': process_nddf_item} # 'NDFRT': process_ndfrt_item, # 'OMIM': process_omim_item, # 'PDQ': process_pdq_item, From c2322b2e0dc97ab39d07d14479fea6d6df4b55d3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:00:55 -0700 Subject: [PATCH 055/117] #316 NDFRT doesn't exist --- umls_list_jsonl_to_kg_jsonl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index c1a23a1d..72d63111 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -555,7 +555,6 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): 'NCBI': process_ncbi_item, 'NCI': process_nci_item, 'NDDF': process_nddf_item} - # 'NDFRT': process_ndfrt_item, # 'OMIM': process_omim_item, # 'PDQ': process_pdq_item, # 'PSY': process_psy_item, @@ -596,7 +595,7 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'NDDF': 
+ if source == 'OMIM': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From c4e19b872b7b83fb974c968c96ff556e8cb4619d Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:08:56 -0700 Subject: [PATCH 056/117] #316 OMIM --- umls_list_jsonl_to_kg_jsonl.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 72d63111..7d1b26b2 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -46,6 +46,7 @@ NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF +OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -536,6 +537,22 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_omim_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'PHENO', 'PHENO_ET', 'PTAV', 'PTCS', 'ETAL', 'ET', 'HT', 'ACR'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(OMIM_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + genesymbol = attributes.get('GENESYMBOL', list()) + mimtypevalue = attributes.get('MIMTYPEVALUE', list()) + moved_from = attributes.get('MOVED_FROM', list()) + sos = attributes.get('SOS', list()) + genelocus = attributes.get('GENELOCUS', list()) + mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) + mimtype = attributes.get('MIMTYPE', list()) + + make_umls_node(node_curie, iri, name, category, 
"2023", provided_by, synonyms, create_description("", tuis), nodes_output) + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, @@ -554,8 +571,8 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): 'MTH': process_mth_item, 'NCBI': process_ncbi_item, 'NCI': process_nci_item, - 'NDDF': process_nddf_item} - # 'OMIM': process_omim_item, + 'NDDF': process_nddf_item, + 'OMIM': process_omim_item} # 'PDQ': process_pdq_item, # 'PSY': process_psy_item, # 'RXNORM': process_rxnorm_item, @@ -595,7 +612,7 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'OMIM': + if source == 'PDQ': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 65d1dc3e8d931edbf8934c7a8b5c5d7fd2edd67a Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:16:57 -0700 Subject: [PATCH 057/117] #316 PDQ --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index 63de5879..1873f418 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -106,6 +106,7 @@ CURIE_PREFIX_PATHWHIZ_REACTION = 'PathWhiz.Reaction' CURIE_PREFIX_PATHWHIZ_BOUND = 'PathWhiz.Bound' CURIE_PREFIX_PATHWHIZ_PROTEIN_COMPLEX = 'PathWhiz.ProteinComplex' +CURIE_PREFIX_PDQ = 'PDQ' CURIE_PREFIX_PMID = 'PMID' CURIE_PREFIX_RDF = 'rdf' CURIE_PREFIX_RDFS = 'rdfs' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 7d1b26b2..8a33c6cc 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -47,6 +47,7 @@ NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM +PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -554,6 +555,28 @@ def process_omim_item(node_id, 
info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_pdq_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'HT', 'PSC', 'SY', 'ET', 'CU', 'LV', 'ACR', 'AB', 'BN', 'FBD', 'CCN', 'CHN', 'OP', 'IS'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(PDQ_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + lt = attributes.get('LT', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + date_first_published = attributes.get('DATE_FIRST_PUBLISHED', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + ind_code = attributes.get('IND_CODE', list()) + pid = attributes.get('PID', list()) + nsc_code = attributes.get('NSC_CODE', list()) + pxc = attributes.get('PXC', list()) + menu_parent = attributes.get('MENU_PARENT', list()) + nci_id = attributes.get('NCI_ID', list()) + orig_sty = attributes.get('ORIG_STY', list()) + menu_type = attributes.get('MENU_TYPE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -572,8 +595,8 @@ def process_omim_item(node_id, info, nodes_output, edges_output): 'NCBI': process_ncbi_item, 'NCI': process_nci_item, 'NDDF': process_nddf_item, - 'OMIM': process_omim_item} - # 'PDQ': process_pdq_item, + 'OMIM': process_omim_item, + 'PDQ': process_pdq_item} # 'PSY': process_psy_item, # 'RXNORM': process_rxnorm_item, # 'VANDF': process_vandf_item} @@ -612,7 +635,7 @@ def process_omim_item(node_id, info, nodes_output, 
edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'PDQ': + if source == 'PSY': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 147a2338a2d86c590cf23a5a15927661dc27623b Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:25:37 -0700 Subject: [PATCH 058/117] #316 PSY --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index 1873f418..92990b0d 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -108,6 +108,7 @@ CURIE_PREFIX_PATHWHIZ_PROTEIN_COMPLEX = 'PathWhiz.ProteinComplex' CURIE_PREFIX_PDQ = 'PDQ' CURIE_PREFIX_PMID = 'PMID' +CURIE_PREFIX_PSY = 'PSY' CURIE_PREFIX_RDF = 'rdf' CURIE_PREFIX_RDFS = 'rdfs' CURIE_PREFIX_REACTOME='REACT' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 8a33c6cc..43a83d95 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -48,6 +48,7 @@ NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ +PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -577,6 +578,18 @@ def process_pdq_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_psy_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'HT', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(PSY_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + hn = 
attributes.get('HN', list()) + pyr = attributes.get('PYR', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -596,8 +609,8 @@ def process_pdq_item(node_id, info, nodes_output, edges_output): 'NCI': process_nci_item, 'NDDF': process_nddf_item, 'OMIM': process_omim_item, - 'PDQ': process_pdq_item} - # 'PSY': process_psy_item, + 'PDQ': process_pdq_item, + 'PSY': process_psy_item} # 'RXNORM': process_rxnorm_item, # 'VANDF': process_vandf_item} @@ -635,7 +648,7 @@ def process_pdq_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'PSY': + if source == 'RXNORM': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From 2267bd124ff4f09c08646db384c354c322ea3b65 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:34:09 -0700 Subject: [PATCH 059/117] #316 RXNORM --- kg2_util.py | 1 + umls_list_jsonl_to_kg_jsonl.py | 41 +++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index 92990b0d..584792ee 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -117,6 +117,7 @@ CURIE_PREFIX_RHEA_COMP = 'RHEA.COMP' CURIE_PREFIX_RO = 'RO' CURIE_PREFIX_RTX = 'RTX' +CURIE_PREFIX_RXNORM = 'RXNORM' CURIE_PREFIX_SEMMEDDB = 'SEMMEDDB' CURIE_PREFIX_SKOS = 'skos' CURIE_PREFIX_SMPDB = 'SMPDB' diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 43a83d95..841b88b5 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -49,6 +49,7 @@ OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY +RXNORM_PREFIX = kg2_util.CURIE_PREFIX_RXNORM UMLS_SOURCE_PREFIX = 
kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -590,6 +591,40 @@ def process_psy_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_rxnorm_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['SCD', 'SBD', 'SCDG', 'SBDG', 'BPCK', 'GPCK', 'IN', 'PSN', 'MIN', 'SCDF', 'SBDF', 'SCDC', 'DFG', 'DF', 'SBDC', 'BN', 'PIN', 'TMSY', 'SY', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(RXNORM_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + rxn_obsoleted = attributes.get('RXN_OBSOLETED', list()) + rxn_available_strength = attributes.get('RXN_AVAILABLE_STRENGTH', list()) + rxn_human_drug = attributes.get('RXN_HUMAN_DRUG', list()) + rxn_quantity = attributes.get('RXN_QUANTITY', list()) + rxterm_form = attributes.get('RXTERM_FORM', list()) + rxn_in_expressed_flag = attributes.get('RXN_IN_EXPRESSED_FLAG', list()) + rxaui = attributes.get('RXAUI', list()) + rxn_bn_cardinality = attributes.get('RXN_BN_CARDINALITY', list()) + rxn_activated = attributes.get('RXN_ACTIVATED', list()) + rxn_boss_strength_denom_unit = attributes.get('RXN_BOSS_STRENGTH_DENOM_UNIT', list()) + ambiguity_flag = attributes.get('AMBIGUITY_FLAG', list()) + rxn_strength = attributes.get('RXN_STRENGTH', list()) + rxcui = attributes.get('RXCUI', list()) + rxn_ai = attributes.get('RXN_AI', list()) + rxn_boss_from = attributes.get('RXN_BOSS_FROM', list()) + rxn_boss_strength_num_unit = attributes.get('RXN_BOSS_STRENGTH_NUM_UNIT', list()) + rxn_vet_drug = attributes.get('RXN_VET_DRUG', list()) + orig_code = attributes.get('ORIG_CODE', list()) + rxn_am = 
attributes.get('RXN_AM', list()) + rxn_boss_strength_denom_value = attributes.get('RXN_BOSS_STRENGTH_DENOM_VALUE', list()) + rxn_boss_strength_num_value = attributes.get('RXN_BOSS_STRENGTH_NUM_VALUE', list()) + rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) + orig_source = attributes.get('ORIG_SOURCE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -610,8 +645,8 @@ def process_psy_item(node_id, info, nodes_output, edges_output): 'NDDF': process_nddf_item, 'OMIM': process_omim_item, 'PDQ': process_pdq_item, - 'PSY': process_psy_item} - # 'RXNORM': process_rxnorm_item, + 'PSY': process_psy_item, + 'RXNORM': process_rxnorm_item} # 'VANDF': process_vandf_item} if __name__ == '__main__': @@ -648,7 +683,7 @@ def process_psy_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'RXNORM': + if source == 'VANDF': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From c2c791a96cb4bd6176b88e9cba189c6d2e0d8cc8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 15:44:36 -0700 Subject: [PATCH 060/117] #316 VANDF --- umls_list_jsonl_to_kg_jsonl.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 841b88b5..753ea42e 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -50,6 +50,7 @@ PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY RXNORM_PREFIX = kg2_util.CURIE_PREFIX_RXNORM +VANDF_PREFIX = kg2_util.CURIE_PREFIX_VANDF UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE @@ -625,6 +626,31 @@ def 
process_rxnorm_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +def process_vandf_item(node_id, info, nodes_output, edges_output): + accession_heirarchy = ['PT', 'CD', 'IN', 'AB', 'MTH_RXN_CD'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(VANDF_PREFIX, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndf_transmit_to_cmop = attributes.get('NDF_TRANSMIT_TO_CMOP', list()) + sngl_or_mult_src_prd = attributes.get('SNGL_OR_MULT_SRC_PRD', list()) + dcsa = attributes.get('DCSA', list()) + exclude_di_check = attributes.get('EXCLUDE_DI_CHECK', list()) + nfi = attributes.get('NFI', list()) + va_class_name = attributes.get('VA_CLASS_NAME', list()) + vmo = attributes.get('VMO', list()) + drug_class_type = attributes.get('DRUG_CLASS_TYPE', list()) + nf_name = attributes.get('NF_NAME', list()) + ndc = attributes.get('NDC', list()) + vac = attributes.get('VAC', list()) + va_generic_name = attributes.get('VA_GENERIC_NAME', list()) + parent_class = attributes.get('PARENT_CLASS', list()) + va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) + ddf = attributes.get('DDF', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + DESIRED_CODES = {'ATC': process_atc_item, 'CHV': process_chv_item, 'DRUGBANK': process_drugbank_item, @@ -646,8 +672,8 @@ def process_rxnorm_item(node_id, info, nodes_output, edges_output): 'OMIM': process_omim_item, 'PDQ': process_pdq_item, 'PSY': process_psy_item, - 'RXNORM': process_rxnorm_item} - # 'VANDF': process_vandf_item} + 'RXNORM': process_rxnorm_item, + 'VANDF': 
process_vandf_item} if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -683,7 +709,7 @@ def process_rxnorm_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'VANDF': + if source == 'UMLS': name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) From e0cec4395e0e0e6a17304564126838e11ded6984 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 21 Aug 2023 16:30:56 -0700 Subject: [PATCH 061/117] #316 Global Accession Heirarchy --- umls_list_jsonl_to_kg_jsonl.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 753ea42e..2a36c339 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -54,6 +54,31 @@ UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE +# Mined from HTML Page Source of https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html +ACCESSION_HEIRARCHY = [('MTH', 'PN'), ('RXNORM', 'SCD'), ('RXNORM', 'SBD'), ('RXNORM', 'SCDG'), ('RXNORM', 'SBDG'), ('RXNORM', 'BPCK'), ('RXNORM', 'GPCK'), + ('RXNORM', 'IN'), ('RXNORM', 'PSN'), ('RXNORM', 'MIN'), ('RXNORM', 'SCDF'), ('RXNORM', 'SBDF'), ('RXNORM', 'SCDC'), ('RXNORM', 'DFG'), + ('RXNORM', 'DF'), ('RXNORM', 'SBDC'), ('RXNORM', 'BN'), ('RXNORM', 'PIN'), ('RXNORM', 'TMSY'), ('RXNORM', 'SY'), ('MSH', 'MH'), + ('MSH', 'TQ'), ('MSH', 'PEP'), ('MSH', 'ET'), ('MSH', 'XQ'), ('MSH', 'PXQ'), ('MSH', 'NM'), ('HPO', 'PT'), ('HPO', 'SY'), ('HPO', 'ET'), + ('NCBI', 'SCN'), ('ATC', 'RXN_PT'), ('ATC', 'PT'), ('VANDF', 'PT'), ('VANDF', 'CD'), ('VANDF', 'IN'), ('DRUGBANK', 'IN'), + ('DRUGBANK', 'SY'), ('DRUGBANK', 'FSY'), ('MSH', 'N1'), ('MSH', 'PCE'), ('MSH', 'CE'), ('FMA', 'PT'), ('FMA', 'SY'), ('FMA', 'AB'), + ('ATC', 'RXN_IN'), ('ATC', 'IN'), ('VANDF', 'AB'), 
('VANDF', 'MTH_RXN_CD'), ('NDDF', 'MTH_RXN_CDC'), ('NDDF', 'CDC'), ('NDDF', 'CDD'), + ('NDDF', 'CDA'), ('NDDF', 'IN'), ('NDDF', 'DF'), ('MED-RT', 'PT'), ('MED-RT', 'FN'), ('MED-RT', 'SY'), ('HCPCS', 'PT'), ('HCPCS', 'MP'), + ('OMIM', 'PT'), ('OMIM', 'PHENO'), ('OMIM', 'PHENO_ET'), ('OMIM', 'PTAV'), ('OMIM', 'PTCS'), ('OMIM', 'ETAL'), ('OMIM', 'ET'), + ('OMIM', 'HT'), ('OMIM', 'ACR'), ('HGNC', 'PT'), ('HGNC', 'ACR'), ('HGNC', 'MTH_ACR'), ('HGNC', 'NA'), ('HGNC', 'SYN'), ('HGNC', 'NP'), + ('HGNC', 'NS'), ('NCI', 'PT'), ('NCI', 'SY'), ('NCI', 'CSN'), ('NCI', 'DN'), ('NCI', 'FBD'), ('NCI', 'HD'), ('NCI', 'CCN'), + ('NCI', 'AD'), ('NCI', 'CA2'), ('NCI', 'CA3'), ('NCI', 'BN'), ('NCI', 'AB'), ('NCI', 'CCS'), ('PDQ', 'PT'), ('PDQ', 'HT'), + ('PDQ', 'PSC'), ('PDQ', 'SY'), ('CHV', 'PT'), ('MEDLINEPLUS', 'PT'), ('GO', 'PT'), ('GO', 'MTH_PT'), ('GO', 'ET'), ('GO', 'MTH_ET'), + ('GO', 'SY'), ('GO', 'MTH_SY'), ('PDQ', 'ET'), ('PDQ', 'CU'), ('PDQ', 'LV'), ('PDQ', 'ACR'), ('PDQ', 'AB'), ('PDQ', 'BN'), ('PDQ', 'FBD'), + ('PDQ', 'CCN'), ('PDQ', 'CHN'), ('NCBI', 'USN'), ('NCBI', 'USY'), ('NCBI', 'SY'), ('NCBI', 'UCN'), ('NCBI', 'CMN'), ('NCBI', 'UE'), + ('NCBI', 'EQ'), ('ICD9CM', 'PT'), ('ICD9CM', 'HT'), ('ICD10PCS', 'PT'), ('ICD10PCS', 'PX'), ('ICD10PCS', 'HX'), ('ICD10PCS', 'MTH_HX'), + ('ICD10PCS', 'HT'), ('ICD10PCS', 'HS'), ('ICD10PCS', 'AB'), ('HL7V3.0', 'CSY'), ('HL7V3.0', 'PT'), ('HL7V3.0', 'CDO'), ('HL7V3.0', 'VS'), + ('HL7V3.0', 'BR'), ('HL7V3.0', 'CPR'), ('HL7V3.0', 'CR'), ('HL7V3.0', 'NPT'), ('HCPCS', 'MTH_HT'), ('MTH', 'CV'), ('MTH', 'XM'), + ('MTH', 'PT'), ('MTH', 'SY'), ('MTH', 'RT'), ('ICD9CM', 'AB'), ('PSY', 'PT'), ('PSY', 'HT'), ('PSY', 'ET'), ('MEDLINEPLUS', 'ET'), + ('MEDLINEPLUS', 'SY'), ('MEDLINEPLUS', 'HT'), ('MSH', 'HT'), ('MSH', 'HS'), ('MSH', 'DEV'), ('MSH', 'DSV'), ('MSH', 'QAB'), + ('MSH', 'QEV'), ('MSH', 'QSV'), ('MSH', 'PM'), ('HCPCS', 'AB'), ('MTH', 'DT'), ('HCPCS', 'AM'), ('CHV', 'SY'), ('RXNORM', 'ET'), + ('HPO', 'OP'), ('HPO', 'IS'), ('NCI', 'OP'), 
('HPO', 'OET'), ('HCPCS', 'OP'), ('HCPCS', 'OM'), ('HCPCS', 'OAM'), ('GO', 'OP'), + ('GO', 'MTH_OP'), ('GO', 'OET'), ('GO', 'MTH_OET'), ('GO', 'IS'), ('GO', 'MTH_IS'), ('PDQ', 'OP'), ('PDQ', 'IS'), ('HL7V3.0', 'OP'), + ('HL7V3.0', 'ONP'), ('HCPCS', 'OA'), ('FMA', 'OP'), ('FMA', 'IS')] def get_args(): arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') From e13e436f0ec506b54d54e9e8765bf847aca062c7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 22 Aug 2023 13:03:14 -0700 Subject: [PATCH 062/117] #316 commit before I start deleting stuff (halve two of the refactor) --- umls-name-heirarchy.yaml | 2598 ++++++++++++++++++++++++++++++++ umls_list_jsonl_to_kg_jsonl.py | 212 ++- umls_util.py | 573 +++++++ 3 files changed, 3275 insertions(+), 108 deletions(-) create mode 100644 umls-name-heirarchy.yaml create mode 100644 umls_util.py diff --git a/umls-name-heirarchy.yaml b/umls-name-heirarchy.yaml new file mode 100644 index 00000000..a5c2996a --- /dev/null +++ b/umls-name-heirarchy.yaml @@ -0,0 +1,2598 @@ +- + - MTH + - PN +- + - MTHCMSFRF + - PT +- + - RXNORM + - SCD +- + - RXNORM + - SBD +- + - RXNORM + - SCDG +- + - RXNORM + - SBDG +- + - RXNORM + - BPCK +- + - RXNORM + - GPCK +- + - RXNORM + - IN +- + - RXNORM + - PSN +- + - RXNORM + - MIN +- + - RXNORM + - SCDF +- + - RXNORM + - SBDF +- + - RXNORM + - SCDC +- + - RXNORM + - DFG +- + - RXNORM + - DF +- + - RXNORM + - SBDC +- + - RXNORM + - BN +- + - RXNORM + - PIN +- + - RXNORM + - TMSY +- + - RXNORM + - SY +- + - MSH + - MH +- + - MSH + - TQ +- + - MSH + - PEP +- + - MSH + - ET +- + - MSH + - XQ +- + - MSH + - PXQ +- + - MSH + - NM +- + - SNOMEDCT_US + - PT +- + - SNOMEDCT_US + - FN +- + - SNOMEDCT_US + - SY +- + - SNOMEDCT_US + - PTGB +- + - SNOMEDCT_US + - SYGB +- + - SNOMEDCT_US + - MTH_PT +- + - SNOMEDCT_US + - MTH_FN +- + - SNOMEDCT_US + - MTH_SY +- + - SNOMEDCT_US + - MTH_PTGB +- + - SNOMEDCT_US + - MTH_SYGB +- + - 
SNOMEDCT_US + - SB +- + - SNOMEDCT_US + - XM +- + - SNOMEDCT_VET + - PT +- + - SNOMEDCT_VET + - FN +- + - SNOMEDCT_VET + - SY +- + - SNOMEDCT_VET + - SB +- + - HPO + - PT +- + - HPO + - SY +- + - HPO + - ET +- + - NCBI + - SCN +- + - MTHSPL + - MTH_RXN_DP +- + - MTHSPL + - DP +- + - MTHSPL + - SU +- + - ATC + - RXN_PT +- + - ATC + - PT +- + - VANDF + - PT +- + - VANDF + - CD +- + - VANDF + - IN +- + - USP + - CD +- + - USP + - IN +- + - USPMG + - HC +- + - USPMG + - PT +- + - MMX + - MTH_RXN_CD +- + - MMX + - MTH_RXN_BD +- + - MMX + - CD +- + - MMX + - BD +- + - DRUGBANK + - IN +- + - DRUGBANK + - SY +- + - DRUGBANK + - FSY +- + - MSH + - N1 +- + - MSH + - PCE +- + - MSH + - CE +- + - CPM + - PT +- + - NEU + - PT +- + - NEU + - SY +- + - FMA + - PT +- + - FMA + - SY +- + - FMA + - AB +- + - UWDA + - PT +- + - UWDA + - SY +- + - UMD + - PT +- + - UMD + - SY +- + - UMD + - ET +- + - UMD + - RT +- + - GS + - CD +- + - MMSL + - CD +- + - GS + - MTH_RXN_BD +- + - GS + - BD +- + - GS + - IN +- + - MMSL + - MTH_RXN_BD +- + - MMSL + - BD +- + - MMSL + - SC +- + - MMSL + - MS +- + - MMSL + - GN +- + - MMSL + - BN +- + - ATC + - RXN_IN +- + - ATC + - IN +- + - MMSL + - IN +- + - VANDF + - AB +- + - GS + - MTH_RXN_CD +- + - VANDF + - MTH_RXN_CD +- + - NDDF + - MTH_RXN_CDC +- + - NDDF + - CDC +- + - NDDF + - CDD +- + - NDDF + - CDA +- + - NDDF + - IN +- + - NDDF + - DF +- + - MED-RT + - PT +- + - MED-RT + - FN +- + - MED-RT + - SY +- + - SPN + - PT +- + - MDR + - PT +- + - MDR + - MTH_PT +- + - MDR + - HG +- + - MDR + - MTH_HG +- + - MDR + - HT +- + - MDR + - MTH_HT +- + - MDR + - LLT +- + - MDR + - MTH_LLT +- + - MDR + - SMQ +- + - MDR + - MTH_SMQ +- + - MDR + - OS +- + - MDR + - AB +- + - CPT + - PT +- + - CPT + - SY +- + - CPT + - ETCLIN +- + - CPT + - POS +- + - CPT + - GLP +- + - CPT + - ETCF +- + - CPT + - MP +- + - HCPT + - PT +- + - HCPCS + - PT +- + - CDT + - PT +- + - MVX + - PT +- + - CVX + - PT +- + - CVX + - RXN_PT +- + - CVX + - AB +- + - HCDT + - PT +- + - HCPCS 
+ - MP +- + - HCPT + - MP +- + - ICD10AE + - PT +- + - ICD10 + - PT +- + - ICD10AE + - PX +- + - ICD10 + - PX +- + - ICD10AE + - PS +- + - ICD10 + - PS +- + - ICD10AMAE + - PT +- + - ICD10AM + - PT +- + - ICD10AMAE + - PX +- + - ICD10AM + - PX +- + - ICD10AMAE + - PS +- + - ICD10AM + - PS +- + - OMIM + - PT +- + - OMIM + - PHENO +- + - OMIM + - PHENO_ET +- + - OMIM + - PTAV +- + - OMIM + - PTCS +- + - OMIM + - ETAL +- + - OMIM + - ET +- + - OMIM + - HT +- + - OMIM + - ACR +- + - MEDCIN + - PT +- + - MEDCIN + - FN +- + - MEDCIN + - XM +- + - MEDCIN + - SY +- + - HGNC + - PT +- + - HGNC + - ACR +- + - HGNC + - MTH_ACR +- + - HGNC + - NA +- + - HGNC + - SYN +- + - HGNC + - NP +- + - HGNC + - NS +- + - ICNP + - PT +- + - PNDS + - PT +- + - PNDS + - HT +- + - PNDS + - XM +- + - NCI + - PT +- + - NCI + - SY +- + - NCI + - CSN +- + - NCI + - DN +- + - NCI + - FBD +- + - NCI + - HD +- + - NCI + - CCN +- + - NCI + - AD +- + - NCI + - CA2 +- + - NCI + - CA3 +- + - NCI + - BN +- + - NCI + - AB +- + - NCI + - CCS +- + - PDQ + - PT +- + - PDQ + - HT +- + - PDQ + - PSC +- + - PDQ + - SY +- + - CHV + - PT +- + - MEDLINEPLUS + - PT +- + - MTHICPC2EAE + - PT +- + - ICPC2EENG + - PT +- + - MTHICPC2ICD10AE + - PT +- + - SOP + - PT +- + - ICF + - HT +- + - ICF + - PT +- + - ICF + - MTH_HT +- + - ICF + - MTH_PT +- + - ICF-CY + - HT +- + - ICF-CY + - PT +- + - ICF-CY + - MTH_HT +- + - ICF-CY + - MTH_PT +- + - ICPC2ICD10ENG + - PT +- + - ICPC + - PX +- + - ICPC + - PT +- + - ICPC + - PS +- + - ICPC + - PC +- + - ICPC + - CX +- + - ICPC + - CP +- + - ICPC + - CS +- + - ICPC + - CC +- + - ICPC2EENG + - CO +- + - ICPC + - CO +- + - MTHICPC2EAE + - AB +- + - ICPC2EENG + - AB +- + - ICPC2P + - PTN +- + - ICPC2P + - MTH_PTN +- + - ICPC2P + - PT +- + - ICPC2P + - MTH_PT +- + - AOT + - PT +- + - AOT + - ET +- + - GO + - PT +- + - GO + - MTH_PT +- + - GO + - ET +- + - GO + - MTH_ET +- + - GO + - SY +- + - GO + - MTH_SY +- + - PDQ + - ET +- + - PDQ + - CU +- + - PDQ + - LV +- + - PDQ + - ACR +- + 
- PDQ + - AB +- + - PDQ + - BN +- + - PDQ + - FBD +- + - PDQ + - CCN +- + - PDQ + - CHN +- + - NCBI + - USN +- + - NCBI + - USY +- + - NCBI + - SY +- + - NCBI + - UCN +- + - NCBI + - CMN +- + - NCBI + - UE +- + - NCBI + - EQ +- + - LNC + - LN +- + - LNC + - MTH_LN +- + - LNC + - OSN +- + - LNC + - DN +- + - LNC + - CN +- + - LNC + - MTH_CN +- + - LNC + - LPDN +- + - LNC + - LPN +- + - LNC + - HC +- + - LNC + - HS +- + - LNC + - OLC +- + - LNC + - LC +- + - LNC + - LS +- + - LNC + - LG +- + - LNC + - LA +- + - ICD10CM + - PT +- + - ICD9CM + - PT +- + - ICD10CM + - HT +- + - ICD9CM + - HT +- + - CCSR_ICD10PCS + - HT +- + - CCSR_ICD10CM + - SD +- + - CCSR_ICD10PCS + - SP +- + - CCSR_ICD10CM + - XM +- + - CCSR_ICD10PCS + - XM +- + - CCS + - HT +- + - CCS + - MD +- + - CCS + - SD +- + - CCS + - MV +- + - CCS + - SP +- + - CCS + - XM +- + - ICPC2ICD10ENG + - XM +- + - ICD10AE + - HT +- + - ICD10PCS + - PT +- + - ICD10PCS + - PX +- + - ICD10PCS + - HX +- + - ICD10PCS + - MTH_HX +- + - ICD10PCS + - HT +- + - ICD10PCS + - HS +- + - ICD10PCS + - AB +- + - ICD10 + - HT +- + - ICD10AE + - HX +- + - ICD10 + - HX +- + - ICD10AE + - HS +- + - ICD10 + - HS +- + - ICD10AMAE + - HT +- + - ICD10AM + - HT +- + - UMD + - HT +- + - ICPC + - HT +- + - ORPHANET + - PT +- + - ORPHANET + - SY +- + - NUCCHCPT + - PT +- + - HL7V3.0 + - CSY +- + - CDCREC + - PT +- + - HL7V3.0 + - PT +- + - HL7V2.5 + - PT +- + - HL7V3.0 + - CDO +- + - HL7V3.0 + - VS +- + - HL7V3.0 + - BR +- + - HL7V3.0 + - CPR +- + - HL7V3.0 + - CR +- + - HL7V3.0 + - NPT +- + - HL7V2.5 + - HTN +- + - CPT + - HT +- + - CDT + - HT +- + - HCPCS + - MTH_HT +- + - CCC + - PT +- + - CCC + - HT +- + - NIC + - IV +- + - NIC + - HC +- + - NANDA-I + - PT +- + - NANDA-I + - HT +- + - NANDA-I + - HC +- + - NANDA-I + - RT +- + - OMS + - MTH_SI +- + - OMS + - PR +- + - OMS + - TG +- + - OMS + - HT +- + - OMS + - PQ +- + - OMS + - IVC +- + - OMS + - SI +- + - OMS + - SCALE +- + - NIC + - AC +- + - NOC + - OC +- + - NOC + - ID +- + - NIC + - 
HT +- + - NOC + - HT +- + - NOC + - HC +- + - CCC + - MTH_HT +- + - CCC + - MP +- + - ALT + - PT +- + - ALT + - HT +- + - MTH + - CV +- + - MTH + - XM +- + - MTH + - PT +- + - MTH + - SY +- + - MTH + - RT +- + - ICD10CM + - ET +- + - MTHICD9 + - ET +- + - ICD10CM + - AB +- + - ICD9CM + - AB +- + - PSY + - PT +- + - PSY + - HT +- + - PSY + - ET +- + - MEDLINEPLUS + - ET +- + - MEDLINEPLUS + - SY +- + - MEDLINEPLUS + - HT +- + - LCH_NW + - PT +- + - LCH + - PT +- + - MSH + - HT +- + - MSH + - HS +- + - MSH + - DEV +- + - MSH + - DSV +- + - MSH + - QAB +- + - MSH + - QEV +- + - MSH + - QSV +- + - MSH + - PM +- + - LCH_NW + - XM +- + - CPT + - AB +- + - HCPT + - AB +- + - HCPCS + - AB +- + - WHO + - PT +- + - WHO + - HT +- + - WHO + - IT +- + - SNMI + - PT +- + - SNMI + - PX +- + - SNMI + - HT +- + - SNMI + - HX +- + - SNMI + - RT +- + - SNMI + - SY +- + - SNMI + - SX +- + - SNMI + - AD +- + - SNM + - PT +- + - SNM + - RT +- + - SNM + - HT +- + - SNM + - SY +- + - SNM + - RS +- + - RCD + - PT +- + - RCD + - SY +- + - RCD + - AT +- + - RCD + - AS +- + - RCD + - AB +- + - RCDSA + - PT +- + - RCDSY + - PT +- + - RCDAE + - PT +- + - RCDSA + - SY +- + - RCDSY + - SY +- + - RCDAE + - SY +- + - RCDAE + - AT +- + - RCDSA + - AB +- + - RCDSY + - AB +- + - RCDAE + - AB +- + - RCDAE + - AA +- + - RCD + - AA +- + - CSP + - PT +- + - CSP + - SY +- + - CSP + - ET +- + - CSP + - AB +- + - MTH + - DT +- + - HCPT + - AM +- + - HCPCS + - AM +- + - HCDT + - AB +- + - ALT + - AB +- + - CHV + - SY +- + - RXNORM + - ET +- + - SNOMEDCT_VET + - OAP +- + - SNOMEDCT_VET + - OP +- + - SNOMEDCT_US + - OAP +- + - SNOMEDCT_US + - OP +- + - SNOMEDCT_VET + - OAF +- + - SNOMEDCT_VET + - OF +- + - SNOMEDCT_US + - OAF +- + - SNOMEDCT_US + - OF +- + - SNOMEDCT_VET + - OAS +- + - SNOMEDCT_VET + - IS +- + - SNOMEDCT_US + - OAS +- + - SNOMEDCT_US + - IS +- + - SNOMEDCT_US + - MTH_OAP +- + - SNOMEDCT_US + - MTH_OP +- + - SNOMEDCT_US + - MTH_OAF +- + - SNOMEDCT_US + - MTH_OF +- + - SNOMEDCT_US + - MTH_OAS +- 
+ - SNOMEDCT_US + - MTH_IS +- + - HPO + - OP +- + - HPO + - IS +- + - NCI + - OP +- + - LNC + - LO +- + - LNC + - MTH_LO +- + - LNC + - OOSN +- + - LNC + - OLG +- + - HPO + - OET +- + - NEU + - OP +- + - NEU + - IS +- + - NEU + - ACR +- + - MDR + - MTH_OS +- + - CDT + - OP +- + - ICPC2P + - OPN +- + - ICPC2P + - MTH_OPN +- + - ICPC2P + - OP +- + - ICPC2P + - MTH_OP +- + - HCPCS + - OP +- + - HCDT + - OP +- + - HCPT + - OP +- + - HCPCS + - OM +- + - HCPCS + - OAM +- + - GO + - OP +- + - GO + - MTH_OP +- + - GO + - OET +- + - GO + - MTH_OET +- + - GO + - IS +- + - GO + - MTH_IS +- + - PDQ + - OP +- + - PDQ + - IS +- + - MDR + - OL +- + - MDR + - MTH_OL +- + - NUCCHCPT + - OP +- + - HL7V3.0 + - OP +- + - HL7V3.0 + - ONP +- + - WHO + - OS +- + - RCD + - OP +- + - RCD + - IS +- + - RCDSA + - OP +- + - RCDSY + - OP +- + - RCDAE + - OP +- + - RCDSA + - IS +- + - RCDSY + - IS +- + - RCDAE + - IS +- + - RCDSA + - OA +- + - RCDSY + - OA +- + - RCDAE + - OA +- + - RCD + - OA +- + - HCPT + - OA +- + - HCPCS + - OA +- + - HCDT + - OA +- + - FMA + - OP +- + - FMA + - IS +- + - DSM-5 + - DC10 +- + - DSM-5 + - DC9 +- + - DXP + - DI +- + - DXP + - FI +- + - DXP + - SY +- + - RAM + - PT +- + - RAM + - RT +- + - ULT + - PT +- + - BI + - PT +- + - BI + - AB +- + - BI + - SY +- + - BI + - RT +- + - PCDS + - GO +- + - PCDS + - OR +- + - PCDS + - PR +- + - PCDS + - CO +- + - PCDS + - HX +- + - PCDS + - HT +- + - MTHMST + - PT +- + - MTHMST + - SY +- + - DDB + - PT +- + - DDB + - SY +- + - CST + - PT +- + - COSTAR + - PT +- + - CST + - SC +- + - CST + - HT +- + - CST + - GT +- + - CCPSS + - TX +- + - CCPSS + - TC +- + - CCPSS + - PT +- + - CCPSS + - MP +- + - AOD + - DE +- + - AOD + - DS +- + - AOD + - XD +- + - AOD + - FN +- + - AOD + - ET +- + - AOD + - ES +- + - AOD + - EX +- + - AOD + - NP +- + - AOD + - NS +- + - AOD + - NX +- + - QMR + - PT +- + - JABL + - PC +- + - JABL + - PT +- + - JABL + - SS +- + - JABL + - SY +- + - AIR + - FI +- + - AIR + - DI +- + - AIR + - SY +- + - AIR + - 
HT +- + - PPAC + - DO +- + - PPAC + - CL +- + - PPAC + - AC +- + - PPAC + - ST +- + - PPAC + - TA +- + - MCM + - PT +- + - MCM + - RT +- + - SCTSPA + - PT +- + - SCTSPA + - FN +- + - SCTSPA + - SY +- + - SCTSPA + - MTH_PT +- + - SCTSPA + - MTH_FN +- + - SCTSPA + - MTH_SY +- + - SCTSPA + - SB +- + - SCTSPA + - OP +- + - SCTSPA + - OAF +- + - SCTSPA + - OAP +- + - SCTSPA + - OAS +- + - SCTSPA + - OF +- + - SCTSPA + - IS +- + - SCTSPA + - MTH_OP +- + - SCTSPA + - MTH_OAF +- + - SCTSPA + - MTH_OAP +- + - SCTSPA + - MTH_OAS +- + - SCTSPA + - MTH_OF +- + - SCTSPA + - MTH_IS +- + - MSHPOR + - MH +- + - MSHPOR + - PEP +- + - MSHPOR + - ET +- + - MSHSPA + - MH +- + - MSHSPA + - PEP +- + - MSHSPA + - ET +- + - MSHCZE + - MH +- + - MSHCZE + - PEP +- + - MSHCZE + - ET +- + - MSHCZE + - TQ +- + - MSHCZE + - XQ +- + - MSHCZE + - PXQ +- + - MSHDUT + - MH +- + - MSHSWE + - MH +- + - MSHSWE + - ET +- + - MSHSWE + - TQ +- + - MSHNOR + - MH +- + - MSHGER + - MH +- + - MSHNOR + - PEP +- + - MSHGER + - PEP +- + - MSHNOR + - DSV +- + - MSHGER + - DSV +- + - MSHNOR + - ET +- + - MSHGER + - ET +- + - MSHFIN + - MH +- + - MSHLAV + - MH +- + - MSHSCR + - MH +- + - MSHFRE + - MH +- + - MSHLAV + - PEP +- + - MSHSCR + - PEP +- + - MSHFRE + - PEP +- + - MSHLAV + - EP +- + - MSHSCR + - ET +- + - MSHFRE + - ET +- + - MSHITA + - MH +- + - MSHITA + - PEP +- + - MSHITA + - ET +- + - MSHJPN + - PT +- + - MSHPOL + - MH +- + - MSHRUS + - MH +- + - MSHJPN + - SY +- + - KCD5 + - HT +- + - TKMT + - PT +- + - KCD5 + - PT +- + - MSHPOL + - SY +- + - MSHRUS + - SY +- + - MSHDUT + - SY +- + - MDRSPA + - PT +- + - MDRSPA + - HG +- + - MDRSPA + - HT +- + - MDRSPA + - LLT +- + - MDRSPA + - OS +- + - MDRSPA + - SMQ +- + - MDRSPA + - OL +- + - MDRSPA + - AB +- + - MDRDUT + - PT +- + - MDRDUT + - HG +- + - MDRDUT + - HT +- + - MDRDUT + - LLT +- + - MDRDUT + - OS +- + - MDRDUT + - SMQ +- + - MDRDUT + - OL +- + - MDRDUT + - AB +- + - MDRFRE + - PT +- + - MDRFRE + - HG +- + - MDRFRE + - HT +- + - MDRFRE + - LLT +- + - 
MDRFRE + - SMQ +- + - MDRFRE + - OS +- + - MDRFRE + - OL +- + - MDRFRE + - AB +- + - MDRGER + - PT +- + - MDRGER + - HG +- + - MDRGER + - HT +- + - MDRGER + - LLT +- + - MDRGER + - SMQ +- + - MDRGER + - OS +- + - MDRGER + - OL +- + - MDRGER + - AB +- + - MDRITA + - PT +- + - MDRITA + - HG +- + - MDRITA + - HT +- + - MDRITA + - LLT +- + - MDRITA + - SMQ +- + - MDRITA + - OS +- + - MDRITA + - OL +- + - MDRITA + - AB +- + - MDRJPN + - PT +- + - MDRJPN + - PTJKN +- + - MDRJPN + - PTJKN1 +- + - MDRJPN + - HG +- + - MDRJPN + - HGJKN +- + - MDRJPN + - HGJKN1 +- + - MDRJPN + - HT +- + - MDRJPN + - HTJKN +- + - MDRJPN + - HTJKN1 +- + - MDRJPN + - LLT +- + - MDRJPN + - LLTJKN +- + - MDRJPN + - LLTJKN1 +- + - MDRJPN + - OS +- + - MDRJPN + - SMQ +- + - MDRJPN + - OL +- + - MDRJPN + - OLJKN +- + - MDRJPN + - OLJKN1 +- + - MDRCZE + - PT +- + - MDRKOR + - PT +- + - MDRHUN + - PT +- + - MDRBPO + - PT +- + - MDRPOR + - PT +- + - MDRLAV + - PT +- + - MDRSWE + - PT +- + - MDRARA + - PT +- + - MDRRUS + - PT +- + - MDRPOL + - PT +- + - MDRGRE + - PT +- + - MDRCZE + - HG +- + - MDRKOR + - HG +- + - MDRHUN + - HG +- + - MDRBPO + - HG +- + - MDRPOR + - HG +- + - MDRLAV + - HG +- + - MDRSWE + - HG +- + - MDRARA + - HG +- + - MDRRUS + - HG +- + - MDRPOL + - HG +- + - MDRGRE + - HG +- + - MDRCZE + - HT +- + - MDRKOR + - HT +- + - MDRHUN + - HT +- + - MDRBPO + - HT +- + - MDRPOR + - HT +- + - MDRLAV + - HT +- + - MDRSWE + - HT +- + - MDRARA + - HT +- + - MDRRUS + - HT +- + - MDRPOL + - HT +- + - MDRGRE + - HT +- + - MDRCZE + - LLT +- + - MDRKOR + - LLT +- + - MDRHUN + - LLT +- + - MDRBPO + - LLT +- + - MDRPOR + - LLT +- + - MDRLAV + - LLT +- + - MDRSWE + - LLT +- + - MDRARA + - LLT +- + - MDRRUS + - LLT +- + - MDRPOL + - LLT +- + - MDRGRE + - LLT +- + - MDRCZE + - OS +- + - MDRKOR + - OS +- + - MDRHUN + - OS +- + - MDRBPO + - OS +- + - MDRPOR + - OS +- + - MDRLAV + - OS +- + - MDRSWE + - OS +- + - MDRARA + - OS +- + - MDRRUS + - OS +- + - MDRPOL + - OS +- + - MDRGRE + - OS +- + - MDRCZE + - 
SMQ +- + - MDRKOR + - SMQ +- + - MDRHUN + - SMQ +- + - MDRLAV + - SMQ +- + - MDRSWE + - SMQ +- + - MDRARA + - SMQ +- + - MDRRUS + - SMQ +- + - MDRPOL + - SMQ +- + - MDRGRE + - SMQ +- + - MDRBPO + - SMQ +- + - MDRPOR + - SMQ +- + - MDRCZE + - OL +- + - MDRKOR + - OL +- + - MDRHUN + - OL +- + - MDRLAV + - OL +- + - MDRSWE + - OL +- + - MDRARA + - OL +- + - MDRRUS + - OL +- + - MDRPOL + - OL +- + - MDRGRE + - OL +- + - MDRBPO + - OL +- + - MDRPOR + - OL +- + - MDRCZE + - AB +- + - MDRKOR + - AB +- + - MDRHUN + - AB +- + - MDRLAV + - AB +- + - MDRSWE + - AB +- + - MDRARA + - AB +- + - MDRRUS + - AB +- + - MDRPOL + - AB +- + - MDRGRE + - AB +- + - MDRBPO + - AB +- + - MDRPOR + - AB +- + - MDRJPN + - OSJKN +- + - MDRJPN + - OSJKN1 +- + - WHOFRE + - HT +- + - WHOGER + - HT +- + - WHOPOR + - HT +- + - WHOSPA + - HT +- + - LNC-DE-DE + - LN +- + - LNC-DE-DE + - LC +- + - LNC-DE-DE + - OLC +- + - LNC-DE-DE + - LO +- + - LNC-EL-GR + - LN +- + - LNC-EL-GR + - LO +- + - LNC-ES-AR + - LN +- + - LNC-ES-AR + - OSN +- + - LNC-ES-AR + - LO +- + - LNC-ES-AR + - OOSN +- + - LNC-ES-MX + - LN +- + - LNC-ES-MX + - LO +- + - LNC-ES-MX + - LC +- + - LNC-ES-MX + - OLC +- + - LNC-ES-ES + - LN +- + - LNC-ES-ES + - LO +- + - LNC-ET-EE + - LN +- + - LNC-ET-EE + - LO +- + - LNC-FR-BE + - LN +- + - LNC-FR-BE + - LO +- + - LNC-FR-CA + - LN +- + - LNC-FR-CA + - LO +- + - LNC-FR-FR + - LN +- + - LNC-FR-FR + - LC +- + - LNC-FR-FR + - OLC +- + - LNC-FR-FR + - LO +- + - LNC-IT-IT + - LN +- + - LNC-IT-IT + - LO +- + - LNC-KO-KR + - LN +- + - LNC-KO-KR + - LO +- + - LNC-PL-PL + - LN +- + - LNC-PL-PL + - LO +- + - LNC-NL-NL + - LN +- + - LNC-NL-NL + - LO +- + - LNC-PT-BR + - LN +- + - LNC-PT-BR + - OSN +- + - LNC-PT-BR + - LO +- + - LNC-PT-BR + - OOSN +- + - LNC-RU-RU + - LN +- + - LNC-RU-RU + - LO +- + - LNC-TR-TR + - LN +- + - LNC-TR-TR + - LO +- + - LNC-UK-UA + - LN +- + - LNC-UK-UA + - LC +- + - LNC-UK-UA + - OSN +- + - LNC-UK-UA + - LVDN +- + - LNC-ZH-CN + - LN +- + - LNC-ZH-CN + - LO +- + - LNC-DE-AT 
+ - LN +- + - LNC-DE-AT + - LO +- + - LNC-DE-AT + - LVDN +- + - MEDLINEPLUS_SPA + - PT +- + - MEDLINEPLUS_SPA + - HT +- + - WHOFRE + - PT +- + - WHOGER + - PT +- + - WHOPOR + - PT +- + - WHOSPA + - PT +- + - WHOFRE + - IT +- + - WHOGER + - IT +- + - WHOPOR + - IT +- + - WHOSPA + - IT +- + - WHOFRE + - OS +- + - WHOGER + - OS +- + - WHOPOR + - OS +- + - WHOSPA + - OS +- + - CPTSP + - PT +- + - DMDUMD + - PT +- + - DMDUMD + - ET +- + - DMDUMD + - RT +- + - DMDICD10 + - PT +- + - DMDICD10 + - HT +- + - ICPCBAQ + - PT +- + - ICPCDAN + - PT +- + - ICPC2EDUT + - PT +- + - ICD10DUT + - PT +- + - ICD10DUT + - HT +- + - ICPC2ICD10DUT + - PT +- + - ICPCDUT + - PT +- + - ICPCFIN + - PT +- + - ICPCFRE + - PT +- + - ICPCGER + - PT +- + - ICPCHEB + - PT +- + - ICPCHUN + - PT +- + - ICPCITA + - PT +- + - ICPCNOR + - PT +- + - ICPCPOR + - PT +- + - ICPCSPA + - PT +- + - ICPCSWE + - PT +- + - ICPCBAQ + - CP +- + - ICPCDAN + - CP +- + - ICPCDUT + - CP +- + - ICPCFIN + - CP +- + - ICPCFRE + - CP +- + - ICPCGER + - CP +- + - ICPCHEB + - CP +- + - ICPCHUN + - CP +- + - ICPCITA + - CP +- + - ICPCNOR + - CP +- + - ICPCPOR + - CP +- + - ICPCSPA + - CP +- + - ICPCSWE + - CP +- + - MTHMSTFRE + - PT +- + - MTHMSTITA + - PT +- + - SRC + - RPT +- + - SRC + - RHT +- + - SRC + - RAB +- + - SRC + - RSY +- + - SRC + - VPT +- + - SRC + - VAB +- + - SRC + - VSY +- + - SRC + - SSN diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 2a36c339..7636bec4 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -18,8 +18,6 @@ import kg2_util import json - - CUIS_KEY = 'cuis' INFO_KEY = 'attributes' NAMES_KEY = 'names' @@ -55,30 +53,32 @@ UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE # Mined from HTML Page Source of https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html -ACCESSION_HEIRARCHY = [('MTH', 'PN'), ('RXNORM', 'SCD'), ('RXNORM', 'SBD'), ('RXNORM', 'SCDG'), ('RXNORM', 'SBDG'), 
('RXNORM', 'BPCK'), ('RXNORM', 'GPCK'), - ('RXNORM', 'IN'), ('RXNORM', 'PSN'), ('RXNORM', 'MIN'), ('RXNORM', 'SCDF'), ('RXNORM', 'SBDF'), ('RXNORM', 'SCDC'), ('RXNORM', 'DFG'), - ('RXNORM', 'DF'), ('RXNORM', 'SBDC'), ('RXNORM', 'BN'), ('RXNORM', 'PIN'), ('RXNORM', 'TMSY'), ('RXNORM', 'SY'), ('MSH', 'MH'), - ('MSH', 'TQ'), ('MSH', 'PEP'), ('MSH', 'ET'), ('MSH', 'XQ'), ('MSH', 'PXQ'), ('MSH', 'NM'), ('HPO', 'PT'), ('HPO', 'SY'), ('HPO', 'ET'), - ('NCBI', 'SCN'), ('ATC', 'RXN_PT'), ('ATC', 'PT'), ('VANDF', 'PT'), ('VANDF', 'CD'), ('VANDF', 'IN'), ('DRUGBANK', 'IN'), - ('DRUGBANK', 'SY'), ('DRUGBANK', 'FSY'), ('MSH', 'N1'), ('MSH', 'PCE'), ('MSH', 'CE'), ('FMA', 'PT'), ('FMA', 'SY'), ('FMA', 'AB'), - ('ATC', 'RXN_IN'), ('ATC', 'IN'), ('VANDF', 'AB'), ('VANDF', 'MTH_RXN_CD'), ('NDDF', 'MTH_RXN_CDC'), ('NDDF', 'CDC'), ('NDDF', 'CDD'), - ('NDDF', 'CDA'), ('NDDF', 'IN'), ('NDDF', 'DF'), ('MED-RT', 'PT'), ('MED-RT', 'FN'), ('MED-RT', 'SY'), ('HCPCS', 'PT'), ('HCPCS', 'MP'), - ('OMIM', 'PT'), ('OMIM', 'PHENO'), ('OMIM', 'PHENO_ET'), ('OMIM', 'PTAV'), ('OMIM', 'PTCS'), ('OMIM', 'ETAL'), ('OMIM', 'ET'), - ('OMIM', 'HT'), ('OMIM', 'ACR'), ('HGNC', 'PT'), ('HGNC', 'ACR'), ('HGNC', 'MTH_ACR'), ('HGNC', 'NA'), ('HGNC', 'SYN'), ('HGNC', 'NP'), - ('HGNC', 'NS'), ('NCI', 'PT'), ('NCI', 'SY'), ('NCI', 'CSN'), ('NCI', 'DN'), ('NCI', 'FBD'), ('NCI', 'HD'), ('NCI', 'CCN'), - ('NCI', 'AD'), ('NCI', 'CA2'), ('NCI', 'CA3'), ('NCI', 'BN'), ('NCI', 'AB'), ('NCI', 'CCS'), ('PDQ', 'PT'), ('PDQ', 'HT'), - ('PDQ', 'PSC'), ('PDQ', 'SY'), ('CHV', 'PT'), ('MEDLINEPLUS', 'PT'), ('GO', 'PT'), ('GO', 'MTH_PT'), ('GO', 'ET'), ('GO', 'MTH_ET'), - ('GO', 'SY'), ('GO', 'MTH_SY'), ('PDQ', 'ET'), ('PDQ', 'CU'), ('PDQ', 'LV'), ('PDQ', 'ACR'), ('PDQ', 'AB'), ('PDQ', 'BN'), ('PDQ', 'FBD'), - ('PDQ', 'CCN'), ('PDQ', 'CHN'), ('NCBI', 'USN'), ('NCBI', 'USY'), ('NCBI', 'SY'), ('NCBI', 'UCN'), ('NCBI', 'CMN'), ('NCBI', 'UE'), - ('NCBI', 'EQ'), ('ICD9CM', 'PT'), ('ICD9CM', 'HT'), ('ICD10PCS', 'PT'), ('ICD10PCS', 
'PX'), ('ICD10PCS', 'HX'), ('ICD10PCS', 'MTH_HX'), - ('ICD10PCS', 'HT'), ('ICD10PCS', 'HS'), ('ICD10PCS', 'AB'), ('HL7V3.0', 'CSY'), ('HL7V3.0', 'PT'), ('HL7V3.0', 'CDO'), ('HL7V3.0', 'VS'), - ('HL7V3.0', 'BR'), ('HL7V3.0', 'CPR'), ('HL7V3.0', 'CR'), ('HL7V3.0', 'NPT'), ('HCPCS', 'MTH_HT'), ('MTH', 'CV'), ('MTH', 'XM'), - ('MTH', 'PT'), ('MTH', 'SY'), ('MTH', 'RT'), ('ICD9CM', 'AB'), ('PSY', 'PT'), ('PSY', 'HT'), ('PSY', 'ET'), ('MEDLINEPLUS', 'ET'), - ('MEDLINEPLUS', 'SY'), ('MEDLINEPLUS', 'HT'), ('MSH', 'HT'), ('MSH', 'HS'), ('MSH', 'DEV'), ('MSH', 'DSV'), ('MSH', 'QAB'), - ('MSH', 'QEV'), ('MSH', 'QSV'), ('MSH', 'PM'), ('HCPCS', 'AB'), ('MTH', 'DT'), ('HCPCS', 'AM'), ('CHV', 'SY'), ('RXNORM', 'ET'), - ('HPO', 'OP'), ('HPO', 'IS'), ('NCI', 'OP'), ('HPO', 'OET'), ('HCPCS', 'OP'), ('HCPCS', 'OM'), ('HCPCS', 'OAM'), ('GO', 'OP'), - ('GO', 'MTH_OP'), ('GO', 'OET'), ('GO', 'MTH_OET'), ('GO', 'IS'), ('GO', 'MTH_IS'), ('PDQ', 'OP'), ('PDQ', 'IS'), ('HL7V3.0', 'OP'), - ('HL7V3.0', 'ONP'), ('HCPCS', 'OA'), ('FMA', 'OP'), ('FMA', 'IS')] +ACCESSION_HEIRARCHY = list() +ACCESSION_SOURCES_HEIRARCHY = dict() + # [('MTH', 'PN'), ('RXNORM', 'SCD'), ('RXNORM', 'SBD'), ('RXNORM', 'SCDG'), ('RXNORM', 'SBDG'), ('RXNORM', 'BPCK'), ('RXNORM', 'GPCK'), + # ('RXNORM', 'IN'), ('RXNORM', 'PSN'), ('RXNORM', 'MIN'), ('RXNORM', 'SCDF'), ('RXNORM', 'SBDF'), ('RXNORM', 'SCDC'), ('RXNORM', 'DFG'), + # ('RXNORM', 'DF'), ('RXNORM', 'SBDC'), ('RXNORM', 'BN'), ('RXNORM', 'PIN'), ('RXNORM', 'TMSY'), ('RXNORM', 'SY'), ('MSH', 'MH'), + # ('MSH', 'TQ'), ('MSH', 'PEP'), ('MSH', 'ET'), ('MSH', 'XQ'), ('MSH', 'PXQ'), ('MSH', 'NM'), ('HPO', 'PT'), ('HPO', 'SY'), ('HPO', 'ET'), + # ('NCBI', 'SCN'), ('ATC', 'RXN_PT'), ('ATC', 'PT'), ('VANDF', 'PT'), ('VANDF', 'CD'), ('VANDF', 'IN'), ('DRUGBANK', 'IN'), + # ('DRUGBANK', 'SY'), ('DRUGBANK', 'FSY'), ('MSH', 'N1'), ('MSH', 'PCE'), ('MSH', 'CE'), ('FMA', 'PT'), ('FMA', 'SY'), ('FMA', 'AB'), + # ('ATC', 'RXN_IN'), ('ATC', 'IN'), ('VANDF', 'AB'), ('VANDF', 
'MTH_RXN_CD'), ('NDDF', 'MTH_RXN_CDC'), ('NDDF', 'CDC'), ('NDDF', 'CDD'), + # ('NDDF', 'CDA'), ('NDDF', 'IN'), ('NDDF', 'DF'), ('MED-RT', 'PT'), ('MED-RT', 'FN'), ('MED-RT', 'SY'), ('HCPCS', 'PT'), ('HCPCS', 'MP'), + # ('OMIM', 'PT'), ('OMIM', 'PHENO'), ('OMIM', 'PHENO_ET'), ('OMIM', 'PTAV'), ('OMIM', 'PTCS'), ('OMIM', 'ETAL'), ('OMIM', 'ET'), + # ('OMIM', 'HT'), ('OMIM', 'ACR'), ('HGNC', 'PT'), ('HGNC', 'ACR'), ('HGNC', 'MTH_ACR'), ('HGNC', 'NA'), ('HGNC', 'SYN'), ('HGNC', 'NP'), + # ('HGNC', 'NS'), ('NCI', 'PT'), ('NCI', 'SY'), ('NCI', 'CSN'), ('NCI', 'DN'), ('NCI', 'FBD'), ('NCI', 'HD'), ('NCI', 'CCN'), + # ('NCI', 'AD'), ('NCI', 'CA2'), ('NCI', 'CA3'), ('NCI', 'BN'), ('NCI', 'AB'), ('NCI', 'CCS'), ('PDQ', 'PT'), ('PDQ', 'HT'), + # ('PDQ', 'PSC'), ('PDQ', 'SY'), ('CHV', 'PT'), ('MEDLINEPLUS', 'PT'), ('GO', 'PT'), ('GO', 'MTH_PT'), ('GO', 'ET'), ('GO', 'MTH_ET'), + # ('GO', 'SY'), ('GO', 'MTH_SY'), ('PDQ', 'ET'), ('PDQ', 'CU'), ('PDQ', 'LV'), ('PDQ', 'ACR'), ('PDQ', 'AB'), ('PDQ', 'BN'), ('PDQ', 'FBD'), + # ('PDQ', 'CCN'), ('PDQ', 'CHN'), ('NCBI', 'USN'), ('NCBI', 'USY'), ('NCBI', 'SY'), ('NCBI', 'UCN'), ('NCBI', 'CMN'), ('NCBI', 'UE'), + # ('NCBI', 'EQ'), ('ICD9CM', 'PT'), ('ICD9CM', 'HT'), ('ICD10PCS', 'PT'), ('ICD10PCS', 'PX'), ('ICD10PCS', 'HX'), ('ICD10PCS', 'MTH_HX'), + # ('ICD10PCS', 'HT'), ('ICD10PCS', 'HS'), ('ICD10PCS', 'AB'), ('HL7V3.0', 'CSY'), ('HL7V3.0', 'PT'), ('HL7V3.0', 'CDO'), ('HL7V3.0', 'VS'), + # ('HL7V3.0', 'BR'), ('HL7V3.0', 'CPR'), ('HL7V3.0', 'CR'), ('HL7V3.0', 'NPT'), ('HCPCS', 'MTH_HT'), ('MTH', 'CV'), ('MTH', 'XM'), + # ('MTH', 'PT'), ('MTH', 'SY'), ('MTH', 'RT'), ('ICD9CM', 'AB'), ('PSY', 'PT'), ('PSY', 'HT'), ('PSY', 'ET'), ('MEDLINEPLUS', 'ET'), + # ('MEDLINEPLUS', 'SY'), ('MEDLINEPLUS', 'HT'), ('MSH', 'HT'), ('MSH', 'HS'), ('MSH', 'DEV'), ('MSH', 'DSV'), ('MSH', 'QAB'), + # ('MSH', 'QEV'), ('MSH', 'QSV'), ('MSH', 'PM'), ('HCPCS', 'AB'), ('MTH', 'DT'), ('HCPCS', 'AM'), ('CHV', 'SY'), ('RXNORM', 'ET'), + # ('HPO', 'OP'), ('HPO', 
'IS'), ('NCI', 'OP'), ('HPO', 'OET'), ('HCPCS', 'OP'), ('HCPCS', 'OM'), ('HCPCS', 'OAM'), ('GO', 'OP'), + # ('GO', 'MTH_OP'), ('GO', 'OET'), ('GO', 'MTH_OET'), ('GO', 'IS'), ('GO', 'MTH_IS'), ('PDQ', 'OP'), ('PDQ', 'IS'), ('HL7V3.0', 'OP'), + # ('HL7V3.0', 'ONP'), ('HCPCS', 'OA'), ('FMA', 'OP'), ('FMA', 'IS')] def get_args(): arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') @@ -140,8 +140,10 @@ def make_umls_node(node_curie, iri, name, category, update_date, provided_by, sy nodes_output.write(node) -def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): - provided_by = make_node_id(UMLS_SOURCE_PREFIX, curie_prefix) +def get_basic_info(curie_prefix, node_id, info, umls_code): + # accession_heirarchy + # for (umls_code_compare, name_key) in ACCESSION_HEIRARCHY: + cuis = info.get(CUIS_KEY, list()) tuis = info.get(TUIS_KEY, list()) if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: @@ -155,11 +157,11 @@ def get_basic_info(curie_prefix, node_id, info, accession_heirarchy): names = info.get(NAMES_KEY, dict()) name, synonyms = get_name_synonyms(names, accession_heirarchy) - return node_curie, iri, name, provided_by, category, synonyms, cuis, tuis + return node_curie, iri, name, category, synonyms, cuis, tuis -def process_atc_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ATC_PREFIX, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) +def process_atc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) # Currently not used, but extracting them in case we want them in the future atc_level = info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] @@ -168,8 +170,8 @@ def process_atc_item(node_id, info, nodes_output, 
edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_chv_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(CHV_PREFIX, node_id, info, ['PT', 'SY']) +def process_chv_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY']) # Currently not used, but extracting them in case we want them in the future combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) @@ -182,8 +184,8 @@ def process_chv_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_drugbank_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(DRUGBANK_PREFIX, node_id, info, ['IN', 'SY', 'FSY']) +def process_drugbank_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['IN', 'SY', 'FSY']) # Currently not used, but extracting them in case we want them in the future fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) @@ -193,8 +195,8 @@ def process_drugbank_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_fma_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(FMA_PREFIX, node_id, info, ['PT', 'SY', 'AB', 'OP', 'IS']) +def process_fma_item(node_id, info, nodes_output, edges_output, umls_code, 
curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY', 'AB', 'OP', 'IS']) # Currently not used, but extracting them in case we want them in the future authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) @@ -203,9 +205,9 @@ def process_fma_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_go_item(node_id, info, nodes_output, edges_output): +def process_go_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY', 'OP', 'MTH_OP', 'OET', 'MTH_OET', 'IS', 'MTH_IS'] - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(GO_PREFIX, node_id.replace('GO:', ''), info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('GO:', ''), info, accession_heirarchy) # GO-specific information attributes = info.get(INFO_KEY, dict()) @@ -231,8 +233,8 @@ def process_go_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description(go_comment, tuis), nodes_output) -def process_hcpcs_item(node_id, info, nodes_output, edges_output): - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HCPCS_PREFIX, node_id, info, ['PT', 'MP', 'MTH_HT']) +def process_hcpcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'MP', 'MTH_HT']) # Currently not used, but extracting them in case we want them in the future - descriptions from 
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(INFO_KEY, dict()) @@ -257,9 +259,9 @@ def process_hcpcs_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_hgnc_item(node_id, info, nodes_output, edges_output): +def process_hgnc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'SYN', 'NP', 'NS'] - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HGNC_PREFIX, node_id.replace('HGNC:', ''), info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HGNC:', ''), info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(INFO_KEY, dict()) @@ -296,12 +298,11 @@ def process_hgnc_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_hl7_item(node_id, info, nodes_output, edges_output): +def process_hl7_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HL7_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) if node_curie == None: return - provided_by = 
make_node_id(UMLS_SOURCE_PREFIX, 'HL7') # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(INFO_KEY, dict()) @@ -344,9 +345,9 @@ def process_hl7_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_hpo_item(node_id, info, nodes_output, edges_output): +def process_hpo_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(HPO_PREFIX, node_id.replace('HP:', ''), info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HP:', ''), info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -359,9 +360,9 @@ def process_hpo_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_icd10pcs_item(node_id, info, nodes_output, edges_output): +def process_icd10pcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ICD10PCS_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, 
tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -371,9 +372,9 @@ def process_icd10pcs_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_icd9cm_item(node_id, info, nodes_output, edges_output): +def process_icd9cm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'HT', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(ICD9CM_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM') # Currently not used, but extracting them in case we want them in the future @@ -387,12 +388,11 @@ def process_icd9cm_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_medrt_item(node_id, info, nodes_output, edges_output): +def process_medrt_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'FN', 'SY'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MEDRT_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) if node_curie == None: return - provided_by = 
make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT') # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -402,12 +402,11 @@ def process_medrt_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_medlineplus_item(node_id, info, nodes_output, edges_output): +def process_medlineplus_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'ET', 'SY', 'HT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MEDLINEPLUS_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) if node_curie == None: return - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS') # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -420,9 +419,9 @@ def process_medlineplus_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_msh_item(node_id, info, nodes_output, edges_output): +def process_msh_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MSH_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, 
synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') # Currently not used, but extracting them in case we want them in the future @@ -458,12 +457,11 @@ def process_msh_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_mth_item(node_id, info, nodes_output, edges_output): +def process_mth_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PN', 'CV', 'XM', 'PT', 'SY', 'RT', 'DT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(MTH_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) if node_curie == None: return - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MTH') # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -488,9 +486,9 @@ def process_mth_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_ncbi_item(node_id, info, nodes_output, edges_output): +def process_ncbi_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCBI_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = 
get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -501,9 +499,9 @@ def process_ncbi_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_nci_item(node_id, info, nodes_output, edges_output): +def process_nci_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NCI_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'NCI') # Currently not used, but extracting them in case we want them in the future @@ -556,9 +554,9 @@ def process_nci_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_nddf_item(node_id, info, nodes_output, edges_output): +def process_nddf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['MTH_RXN_CDC', 'CDC', 'CDD', 'CDA', 'IN', 'DF'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(NDDF_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # 
Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -566,9 +564,9 @@ def process_nddf_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_omim_item(node_id, info, nodes_output, edges_output): +def process_omim_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'PHENO', 'PHENO_ET', 'PTAV', 'PTCS', 'ETAL', 'ET', 'HT', 'ACR'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(OMIM_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -583,9 +581,9 @@ def process_omim_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_pdq_item(node_id, info, nodes_output, edges_output): +def process_pdq_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'HT', 'PSC', 'SY', 'ET', 'CU', 'LV', 'ACR', 'AB', 'BN', 'FBD', 'CCN', 'CHN', 'OP', 'IS'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(PDQ_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want 
them in the future attributes = info.get(INFO_KEY, dict()) @@ -605,9 +603,9 @@ def process_pdq_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_psy_item(node_id, info, nodes_output, edges_output): +def process_psy_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'HT', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(PSY_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -617,9 +615,9 @@ def process_psy_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_rxnorm_item(node_id, info, nodes_output, edges_output): +def process_rxnorm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['SCD', 'SBD', 'SCDG', 'SBDG', 'BPCK', 'GPCK', 'IN', 'PSN', 'MIN', 'SCDF', 'SBDF', 'SCDC', 'DFG', 'DF', 'SBDC', 'BN', 'PIN', 'TMSY', 'SY', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(RXNORM_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, 
dict()) @@ -651,9 +649,9 @@ def process_rxnorm_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -def process_vandf_item(node_id, info, nodes_output, edges_output): +def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): accession_heirarchy = ['PT', 'CD', 'IN', 'AB', 'MTH_RXN_CD'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, provided_by, category, synonyms, cuis, tuis = get_basic_info(VANDF_PREFIX, node_id, info, accession_heirarchy) + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) # Currently not used, but extracting them in case we want them in the future attributes = info.get(INFO_KEY, dict()) @@ -676,29 +674,29 @@ def process_vandf_item(node_id, info, nodes_output, edges_output): make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) -DESIRED_CODES = {'ATC': process_atc_item, - 'CHV': process_chv_item, - 'DRUGBANK': process_drugbank_item, - 'FMA': process_fma_item, - 'GO': process_go_item, - 'HCPCS': process_hcpcs_item, - 'HGNC': process_hgnc_item, - 'HL7V3.0': process_hl7_item, - 'HPO': process_hpo_item, - 'ICD10PCS': process_icd10pcs_item, - 'ICD9CM': process_icd9cm_item, - 'MED-RT': process_medrt_item, - 'MEDLINEPLUS': process_medlineplus_item, - 'MSH': process_msh_item, - 'MTH': process_mth_item, - 'NCBI': process_ncbi_item, - 'NCI': process_nci_item, - 'NDDF': process_nddf_item, - 'OMIM': process_omim_item, - 'PDQ': process_pdq_item, - 'PSY': process_psy_item, - 'RXNORM': process_rxnorm_item, - 'VANDF': process_vandf_item} +DESIRED_CODES = {'ATC': [process_atc_item, kg2_util.CURIE_PREFIX_ATC, make_node_id(UMLS_SOURCE_PREFIX, 'ATC')], + 'CHV': 
[process_chv_item, kg2_util.CURIE_PREFIX_CHV, make_node_id(UMLS_SOURCE_PREFIX, 'CHV')], + 'DRUGBANK': [process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, make_node_id(UMLS_SOURCE_PREFIX, 'DRUGBANK')], + 'FMA': [process_fma_item, kg2_util.CURIE_PREFIX_FMA, make_node_id(UMLS_SOURCE_PREFIX, 'FMA')], + 'GO': [process_go_item, kg2_util.CURIE_PREFIX_GO, make_node_id(UMLS_SOURCE_PREFIX, 'GO')], + 'HCPCS': [process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, make_node_id(UMLS_SOURCE_PREFIX, 'HCPCS')], + 'HGNC': [process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, make_node_id(UMLS_SOURCE_PREFIX, 'HGNC')], + 'HL7V3.0': [process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'HL7')], + 'HPO': [process_hpo_item, kg2_util.CURIE_PREFIX_HP, make_node_id(UMLS_SOURCE_PREFIX, 'HPO')], + 'ICD10PCS': [process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS')], + 'ICD9CM': [process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM')], + 'MED-RT': [process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT')], + 'MEDLINEPLUS': [process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS')], + 'MSH': [process_msh_item, kg2_util.CURIE_PREFIX_MESH, make_node_id(UMLS_SOURCE_PREFIX, 'MSH')], + 'MTH': [process_mth_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MTH')], + 'NCBI': [process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, make_node_id(UMLS_SOURCE_PREFIX, 'NCBITAXON')], + 'NCI': [process_nci_item, kg2_util.CURIE_PREFIX_NCIT, make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], + 'NDDF': [process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], + 'OMIM': [process_omim_item, kg2_util.CURIE_PREFIX_OMIM, make_node_id(UMLS_SOURCE_PREFIX, 'OMIM')], + 'PDQ': [process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, make_node_id(UMLS_SOURCE_PREFIX, 'PDQ')], + 'PSY': [process_psy_item, 
kg2_util.CURIE_PREFIX_PSY, make_node_id(UMLS_SOURCE_PREFIX, 'PSY')], + 'RXNORM': [process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, make_node_id(UMLS_SOURCE_PREFIX, 'RXNORM')], + 'VANDF': [process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, make_node_id(UMLS_SOURCE_PREFIX, 'VANDF')]} if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -722,6 +720,8 @@ def process_vandf_item(node_id, info, nodes_output, edges_output): TUI_MAPPINGS = json.load(mappings) iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('curies-to-urls-map.yaml'))['use_for_bidirectional_mapping'] + heirarchy = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('umls-name-heirarchy.yaml')) + print(json.dumps(heirarchy, indent=4)) for item in iri_mappings_raw: for prefix in item: IRI_MAPPINGS[prefix] = item[prefix] @@ -734,16 +734,12 @@ def process_vandf_item(node_id, info, nodes_output, edges_output): value = data[entity] source, node_id = extract_node_id(entity) - if source == 'UMLS': - name_keys.add(get_name_keys(value.get(NAMES_KEY, dict()))) - attribute_keys.update(get_attribute_keys(value.get(INFO_KEY, dict()))) - if source not in DESIRED_CODES: continue # Process the data specifically by source - DESIRED_CODES[source](node_id, value, nodes_output, edges_output) - + [source_function, curie_prefix, provided_by] = DESIRED_CODES[source] + source_function(node_id, value, nodes_output, edges_output, source, curie_prefix, provided_by) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) diff --git a/umls_util.py b/umls_util.py new file mode 100644 index 00000000..84d76c55 --- /dev/null +++ b/umls_util.py @@ -0,0 +1,573 @@ +#!/usr/bin/env python3 +'''umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format + + Usage: umls_list_jsonl_to_kg_jsonl.py [--test] +''' + +__author__ = 
'Erica Wood' +__copyright__ = 'Oregon State University' +__credits__ = ['Stephen Ramsey', 'Erica Wood'] +__license__ = 'MIT' +__version__ = '0.1.0' +__maintainer__ = '' +__email__ = '' +__status__ = 'Prototype' + + +import kg2_util + + +def make_node_id(curie_prefix, node_id): + return curie_prefix + ':' + node_id + + +def get_name_synonyms(names_dict, accession_heirarchy): + names = list() + for key in accession_heirarchy: + names += [name for name in names_dict.get(key, dict()).get('Y', list())] + names += [name for name in names_dict.get(key, dict()).get('N', list())] + assert len(names) > 0 + if len(names) == 1: + return names[0], list() + return names[0], names[1:] + + +def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): + node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) + node['synonym'] = synonyms + node['description'] = description + + nodes_output.write(node) + + +def get_basic_info(curie_prefix, node_id, info, umls_code): + # accession_heirarchy + # for (umls_code_compare, name_key) in ACCESSION_HEIRARCHY: + + cuis = info.get(CUIS_KEY, list()) + tuis = info.get(TUIS_KEY, list()) + if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: + if len(cuis) != 1: + return None, None, None, None, None, None, None, None + node_id = cuis[0] + node_curie = make_node_id(curie_prefix, node_id) + iri = IRI_MAPPINGS[curie_prefix] + node_id + category = TUI_MAPPINGS[str(tuple(tuis))] + + names = info.get(NAMES_KEY, dict()) + name, synonyms = get_name_synonyms(names, accession_heirarchy) + + return node_curie, iri, name, category, synonyms, cuis, tuis + +def process_atc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) + + # Currently not used, but extracting them in case we want them in the future + atc_level = 
info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] + is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_chv_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY']) + + # Currently not used, but extracting them in case we want them in the future + combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) + combo_score_no_top_words = info.get(INFO_KEY, dict()).get('COMBO_SCORE_NO_TOP_WORDS', list()) + context_score = info.get(INFO_KEY, dict()).get('CONTEXT_SCORE', list()) + cui_score = info.get(INFO_KEY, dict()).get('CUI_SCORE', list()) + disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) + frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_drugbank_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['IN', 'SY', 'FSY']) + + # Currently not used, but extracting them in case we want them in the future + fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) + secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) + + # TODO: figure out update date + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_fma_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY', 'AB', 'OP', 
'IS']) + + # Currently not used, but extracting them in case we want them in the future + authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) + date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_go_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY', 'OP', 'MTH_OP', 'OET', 'MTH_OET', 'IS', 'MTH_IS'] + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('GO:', ''), info, accession_heirarchy) + + # GO-specific information + attributes = info.get(INFO_KEY, dict()) + go_namespace = attributes.get('GO_NAMESPACE', list()) + assert len(go_namespace) == 1 + go_namespace = go_namespace[0] + namespace_category_map = {'molecular_function': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, + 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, + 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} + category = namespace_category_map.get(go_namespace, category) + go_comment = attributes.get('GO_COMMENT', str()) + if len(go_comment) > 0: + go_comment = go_comment[0] + go_comment = "// COMMENTS: " + go_comment + + # Currently not used, but extracting them in case we want them in the future + date_created = attributes.get('DATE_CREATED', list()) + go_subset = attributes.get('GO_SUBSET', list()) + gxr = attributes.get('GXR', list()) + ref = attributes.get('REF', list()) + sid = attributes.get('SID', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description(go_comment, tuis), nodes_output) + + +def process_hcpcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + node_curie, iri, name, category, synonyms, cuis, tuis = 
get_basic_info(curie_prefix, node_id, info, ['PT', 'MP', 'MTH_HT']) + + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) + had = attributes.get('HAD', list()) # HCPCS Action Effective Date - effective date of action to a procedure or modifier code. + hcc = attributes.get('HCC', list()) # HCPCS Coverage Code - code denoting Medicare coverage status. There are two subelements separated by "=". + hts = attributes.get('HTS', list()) # HCPCS Type of Service Code - carrier assigned HCFA Type of Service which describes the particular kind(s) of service represented by the procedure code. + hcd = attributes.get('HCD', list()) # HCPCS Code Added Date - year the HCPCS code was added to the HCFA Common Procedure Coding System. + hpn = attributes.get('HPN', list()) # HCPCS processing note number identifying the processing note contained in Appendix A of the HCPCS Manual. + haq = attributes.get('HAQ', list()) # HCPCS Anesthesia Base Unit Quantity - base unit represents the level of intensity for anesthesia procedure services that reflects all activities except time. + hlc = attributes.get('HLC', list()) # HCPCS Lab Certification Code - code used to classify laboratory procedures according to the specialty certification categories listed by CMS(formerly HCFA). + hsn = attributes.get('HSN', list()) # HCPCS Statute Number identifying statute reference for coverage or noncoverage of procedure or service. + hpd = attributes.get('HPD', list()) # HCPCS ASC payment group effective date - date the procedure is assigned to the ASC payment group. + hpg = attributes.get('HPG', list()) # HCPCS ASC payment group code which represents the dollar amount of the facility charge payable by Medicare for the procedure. 
+ hmg = attributes.get('HMR', list()) # HCPCS Medicare Carriers Manual reference section number - number identifying a section of the Medicare Carriers Manual. + hir = attributes.get('HIR', list()) # HCPCS Coverage Issues Manual Reference Section Number - number identifying the Reference Section of the Coverage Issues Manual. + hxr = attributes.get('HXR', list()) # HCPCS Cross reference code - an explicit reference crosswalking a deleted code or a code that is not valid for Medicare to a valid current code (or range of codes). + hmp = attributes.get('HMP', list()) # HCPCS Multiple Pricing Indicator Code - code used to identify instances where a procedure could be priced. + hpi = attributes.get('HPI', list()) # HCPCS Pricing Indicator Code - used to identify the appropriate methodology for developing unique pricing amounts under Part B. + hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. + hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. 
+ + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_hgnc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'SYN', 'NP', 'NS'] + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HGNC:', ''), info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + mgd_id = attributes.get('MGD_ID', list()) + vega_id = attributes.get('VEGA_ID', list()) + genecc = attributes.get('GENCC', list()) + swp = attributes.get('SWP', list()) + mane_select = attributes.get('MANE_SELECT', list()) + local_specific_db_xr = attributes.get('LOCUS_SPECIFIC_DB_XR', list()) + locus_type = attributes.get('LOCUS_TYPE', list()) + agr = attributes.get('AGR', list()) + cytogenetic_location = attributes.get('CYTOGENETIC_LOCATION', list()) + date_created = attributes.get('DATE_CREATED', list()) + ensemblgene_id = attributes.get('ENSEMBLGENE_ID', list()) + db_xr_id = attributes.get('DB_XR_ID', list()) + locus_group = attributes.get('LOCUS_GROUP', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + date_name_changed = attributes.get('DATE_NAME_CHANGED', list()) + pmid = attributes.get('PMID', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + mapped_ucsc_id = attributes.get('MAPPED_UCSC_ID', list()) + refseq_id = attributes.get('REFSEQ_ID', list()) + ena = attributes.get('ENA', list()) + rgd_id = attributes.get('RGD_ID', list()) + date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) + omim_id = attributes.get('OMIM_ID', list()) + gene_fam_id = attributes.get('GENE_FAM_ID', list()) + gene_symbol = attributes.get('GENESYMBOL', list()) + ez = attributes.get('EZ', list()) + ccds_id = attributes.get('CCDS_ID', list()) + lncipedia = 
attributes.get('LNCIPEDIA', list()) + gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_hl7_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(INFO_KEY, dict()) + hl7at = attributes.get('HL7AT', list()) + hl7ii = attributes.get('HL7II', list()) + hl7im = attributes.get('HL7IM', list()) + hl7lt = attributes.get('HL7LT', list()) + hl7un = attributes.get('HL7UN', list()) + hl7oa = attributes.get('HL7OA', list()) + hl7scs = attributes.get('HL7SCS', list()) + hl7cc = attributes.get('HL7CC', list()) + hl7na = attributes.get('HL7NA', list()) + hl7in = attributes.get('HL7IN', list()) + hl7ap = attributes.get('HL7AP', list()) + hl7mi = attributes.get('HL7MI', list()) + hl7hi = attributes.get('HL7HI', list()) + hl7ir = attributes.get('HL7IR', list()) + hl7ai = attributes.get('HL7AI', list()) + hl7ha = attributes.get('HL7HA', list()) + hl7rf = attributes.get('HL7RF', list()) + hl7rd = attributes.get('HL7RD', list()) + hl7vd = attributes.get('HL7VD', list()) + hl7dc = attributes.get('HL7DC', list()) + hl7rk = attributes.get('HL7RK', list()) + hl7is = attributes.get('HL7IS', list()) + hl7sy = attributes.get('HL7SY', list()) + hl7cd = attributes.get('HL7CD', list()) + hl7sl = attributes.get('HL7SL', list()) + hl7pl = 
attributes.get('HL7PL', list()) + hl7vc = attributes.get('HL7VC', list()) + hl7ty = attributes.get('HL7TY', list()) + hl7rg = attributes.get('HL7RG', list()) + hl7csc = attributes.get('HL7CSC', list()) + hl7od = attributes.get('HL7OD', list()) + hl7id = attributes.get('HL7ID', list()) + hl7tr = attributes.get('HL7TR', list()) + hl7di = attributes.get('HL7DI', list()) + hl7cs = attributes.get('HL7CS', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_hpo_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HP:', ''), info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + sid = attributes.get('SID', list()) + hpo_comment = attributes.get('HPO_COMMENT', list()) + date_created = attributes.get('DATE_CREATED', list()) + syn_qualifier = attributes.get('SYN_QUALIFIER', list()) + ref = attributes.get('REF', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_icd10pcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) 
+ added_meaning = attributes.get('ADDED_MEANING', list()) + order_no = attributes.get('ORDER_NO', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_icd9cm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'HT', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + icc = attributes.get('ICC', list()) + ice = attributes.get('ICE', list()) + icf = attributes.get('ICF', list()) + sos = attributes.get('SOS', list()) + icn = attributes.get('ICN', list()) + ica = attributes.get('ICA', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + +def process_medrt_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'FN', 'SY'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + term_status = attributes.get('TERM_STATUS', list()) + concept_type = attributes.get('CONCEPT_TYPE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_medlineplus_item(node_id, info, 
nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'ET', 'SY', 'HT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + sos = attributes.get('SOS', list()) + date_created = attributes.get('DATE_CREATED', list()) + mp_group_url = attributes.get('MP_GROUP_URL', list()) + mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) + mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_msh_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + mmr = attributes.get('MMR', list()) + fx = attributes.get('FX', list()) + lt = attributes.get('LT', list()) + dc = attributes.get('DC', list()) + pa = attributes.get('PA', list()) + rr = attributes.get('RR', list()) + hm = attributes.get('HM', list()) + pi = attributes.get('PI', list()) + ec = attributes.get('EC', list()) + hn = attributes.get('HN', list()) + termui = attributes.get('TERMUI', 
list()) + th = attributes.get('TH', list()) + sos = attributes.get('SOS', list()) + ii = attributes.get('II', list()) + rn = attributes.get('RN', list()) + an = attributes.get('AN', list()) + cx = attributes.get('CX', list()) + dq = attributes.get('DQ', list()) + dx = attributes.get('DX', list()) + pm = attributes.get('PM', list()) + aql = attributes.get('AQL', list()) + sc = attributes.get('SC', list()) + fr = attributes.get('FR', list()) + mda = attributes.get('MDA', list()) + src = attributes.get('SRC', list()) + ol = attributes.get('OL', list()) + mn = attributes.get('MN', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_mth_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PN', 'CV', 'XM', 'PT', 'SY', 'RT', 'DT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + mth_mapsetcomplexity = attributes.get('MTH_MAPSETCOMPLEXITY', list()) + fromvsab = attributes.get('FROMVSAB', list()) + mapsetrsab = attributes.get('MAPSETRSAB', list()) + mapsetversion = attributes.get('MAPSETVERSION', list()) + mapsetvsab = attributes.get('MAPSETVSAB', list()) + tovsab = attributes.get('TOVSAB', list()) + mth_mapfromexhaustive = attributes.get('MTH_MAPFROMEXHAUSTIVE', list()) + torsab = attributes.get('TORSAB', list()) + mapsetsid = attributes.get('MAPSETSID', list()) + mapsetgrammar = attributes.get('MAPSETGRAMMAR', list()) + mapsettype = attributes.get('MAPSETTYPE', list()) + mth_maptoexhaustive = attributes.get('MTH_MAPTOEXHAUSTIVE', list()) + fromrsab = attributes.get('FROMRSAB', 
list()) + mth_mapfromcomplexity = attributes.get('MTH_MAPFROMCOMPLEXITY', list()) + lt = attributes.get('LT', list()) + mth_maptocomplexity = attributes.get('MTH_MAPTOCOMPLEXITY', list()) + sos = attributes.get('SOS', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_ncbi_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + div = attributes.get('DIV', list()) + authority_name = attributes.get('AUTHORITY_NAME', list()) + rank = attributes.get('RANK', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_nci_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'NCI') + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) + micronutrient = attributes.get('MICRONUTRIENT', list()) + genbank_accession_number = 
attributes.get('GENBANK_ACCESSION_NUMBER', list()) + fda_table = attributes.get('FDA_TABLE', list()) + usda_id = attributes.get('USDA_ID', list()) + icd_o_3_code = attributes.get('ICD-O-3_CODE', list()) + tolerable_level = attributes.get('TOLERABLE_LEVEL', list()) + ncbi_taxon_id = attributes.get('NCBI_TAXON_ID', list()) + mgi_accession_id = attributes.get('MGI_ACCESSION_ID', list()) + homologous_gene = attributes.get('HOMOLOGOUS_GENE', list()) + pid_id = attributes.get('PID_ID', list()) + swiss_prot = attributes.get('SWISS_PROT', list()) + essential_amino_acid = attributes.get('ESSENTIAL_AMINO_ACID', list()) + publish_value_set = attributes.get('PUBLISH_VALUE_SET', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + value_set_pair = attributes.get('VALUE_SET_PAIR', list()) + accepted_therapeutic_use_for = attributes.get('ACCEPTED_THERAPEUTIC_USE_FOR', list()) + hgnc_id = attributes.get('HGNC_ID', list()) + nci_drug_dictionary_id = attributes.get('NCI_DRUG_DICTIONARY_ID', list()) + chebi_id = attributes.get('CHEBI_ID', list()) + cnu = attributes.get('CNU', list()) + mirbase_id = attributes.get('MIRBASE_ID', list()) + macronutrient = attributes.get('MACRONUTRIENT', list()) + essential_fatty_acid = attributes.get('ESSENTIAL_FATTY_ACID', list()) + unit = attributes.get('UNIT', list()) + pdq_open_trial_search_id = attributes.get('PDQ_OPEN_TRIAL_SEARCH_ID', list()) + term_browser_value_set_description = attributes.get('TERM_BROWSER_VALUE_SET_DESCRIPTION', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + infoods = attributes.get('INFOODS', list()) + pubmedid_primary_reference = attributes.get('PUBMEDID_PRIMARY_REFERENCE', list()) + biocarta_id = attributes.get('BIOCARTA_ID', list()) + extensible_list = attributes.get('EXTENSIBLE_LIST', list()) + use_for = attributes.get('USE_FOR', list()) + neoplastic_status = attributes.get('NEOPLASTIC_STATUS', list()) + nsc_number = attributes.get('NSC_NUMBER', list()) + omim_number = 
attributes.get('OMIM_NUMBER', list()) + lt = attributes.get('LT', list()) + kegg_id = attributes.get('KEGG_ID', list()) + gene_encodes_product = attributes.get('GENE_ENCODES_PRODUCT', list()) + pdq_closed_trial_search_id = attributes.get('PDQ_CLOSED_TRIAL_SEARCH_ID', list()) + design_note = attributes.get('DESIGN_NOTE', list()) + nutrient = attributes.get('NUTRIENT', list()) + fda_unii_code = attributes.get('FDA_UNII_CODE', list()) + us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) + chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + +def process_nddf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['MTH_RXN_CDC', 'CDC', 'CDD', 'CDA', 'IN', 'DF'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + +def process_omim_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'PHENO', 'PHENO_ET', 'PTAV', 'PTCS', 'ETAL', 'ET', 'HT', 'ACR'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + genesymbol = 
attributes.get('GENESYMBOL', list()) + mimtypevalue = attributes.get('MIMTYPEVALUE', list()) + moved_from = attributes.get('MOVED_FROM', list()) + sos = attributes.get('SOS', list()) + genelocus = attributes.get('GENELOCUS', list()) + mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) + mimtype = attributes.get('MIMTYPE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_pdq_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'HT', 'PSC', 'SY', 'ET', 'CU', 'LV', 'ACR', 'AB', 'BN', 'FBD', 'CCN', 'CHN', 'OP', 'IS'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + lt = attributes.get('LT', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + date_first_published = attributes.get('DATE_FIRST_PUBLISHED', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + ind_code = attributes.get('IND_CODE', list()) + pid = attributes.get('PID', list()) + nsc_code = attributes.get('NSC_CODE', list()) + pxc = attributes.get('PXC', list()) + menu_parent = attributes.get('MENU_PARENT', list()) + nci_id = attributes.get('NCI_ID', list()) + orig_sty = attributes.get('ORIG_STY', list()) + menu_type = attributes.get('MENU_TYPE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_psy_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'HT', 'ET'] # 
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + hn = attributes.get('HN', list()) + pyr = attributes.get('PYR', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_rxnorm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['SCD', 'SBD', 'SCDG', 'SBDG', 'BPCK', 'GPCK', 'IN', 'PSN', 'MIN', 'SCDF', 'SBDF', 'SCDC', 'DFG', 'DF', 'SBDC', 'BN', 'PIN', 'TMSY', 'SY', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + rxn_obsoleted = attributes.get('RXN_OBSOLETED', list()) + rxn_available_strength = attributes.get('RXN_AVAILABLE_STRENGTH', list()) + rxn_human_drug = attributes.get('RXN_HUMAN_DRUG', list()) + rxn_quantity = attributes.get('RXN_QUANTITY', list()) + rxterm_form = attributes.get('RXTERM_FORM', list()) + rxn_in_expressed_flag = attributes.get('RXN_IN_EXPRESSED_FLAG', list()) + rxaui = attributes.get('RXAUI', list()) + rxn_bn_cardinality = attributes.get('RXN_BN_CARDINALITY', list()) + rxn_activated = attributes.get('RXN_ACTIVATED', list()) + rxn_boss_strength_denom_unit = attributes.get('RXN_BOSS_STRENGTH_DENOM_UNIT', list()) + ambiguity_flag = attributes.get('AMBIGUITY_FLAG', list()) + rxn_strength = attributes.get('RXN_STRENGTH', list()) + rxcui = 
attributes.get('RXCUI', list()) + rxn_ai = attributes.get('RXN_AI', list()) + rxn_boss_from = attributes.get('RXN_BOSS_FROM', list()) + rxn_boss_strength_num_unit = attributes.get('RXN_BOSS_STRENGTH_NUM_UNIT', list()) + rxn_vet_drug = attributes.get('RXN_VET_DRUG', list()) + orig_code = attributes.get('ORIG_CODE', list()) + rxn_am = attributes.get('RXN_AM', list()) + rxn_boss_strength_denom_value = attributes.get('RXN_BOSS_STRENGTH_DENOM_VALUE', list()) + rxn_boss_strength_num_value = attributes.get('RXN_BOSS_STRENGTH_NUM_VALUE', list()) + rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) + orig_source = attributes.get('ORIG_SOURCE', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) + + +def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): + accession_heirarchy = ['PT', 'CD', 'IN', 'AB', 'MTH_RXN_CD'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html + node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(INFO_KEY, dict()) + ndf_transmit_to_cmop = attributes.get('NDF_TRANSMIT_TO_CMOP', list()) + sngl_or_mult_src_prd = attributes.get('SNGL_OR_MULT_SRC_PRD', list()) + dcsa = attributes.get('DCSA', list()) + exclude_di_check = attributes.get('EXCLUDE_DI_CHECK', list()) + nfi = attributes.get('NFI', list()) + va_class_name = attributes.get('VA_CLASS_NAME', list()) + vmo = attributes.get('VMO', list()) + drug_class_type = attributes.get('DRUG_CLASS_TYPE', list()) + nf_name = attributes.get('NF_NAME', list()) + ndc = attributes.get('NDC', list()) + vac = attributes.get('VAC', list()) + va_generic_name = attributes.get('VA_GENERIC_NAME', list()) + parent_class = 
attributes.get('PARENT_CLASS', list()) + va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) + ddf = attributes.get('DDF', list()) + + make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) From b419e86eb9688e0689ac4442bce9b331118c16bc Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 22 Aug 2023 15:55:26 -0700 Subject: [PATCH 063/117] #316 successfully refactored UMLS code into a class that takes care of all the variables and source picking --- umls_list_jsonl_to_kg_jsonl.py | 735 ++------------------ umls_util.py | 1155 +++++++++++++++++--------------- 2 files changed, 667 insertions(+), 1223 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 7636bec4..49569844 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -17,68 +17,64 @@ import argparse import kg2_util import json - -CUIS_KEY = 'cuis' -INFO_KEY = 'attributes' -NAMES_KEY = 'names' -TUIS_KEY = 'tuis' +import umls_util TUI_MAPPINGS = dict() IRI_MAPPINGS = dict() -ATC_PREFIX = kg2_util.CURIE_PREFIX_ATC -CHV_PREFIX = kg2_util.CURIE_PREFIX_CHV -DRUGBANK_PREFIX = kg2_util.CURIE_PREFIX_DRUGBANK -FMA_PREFIX = kg2_util.CURIE_PREFIX_FMA -GO_PREFIX = kg2_util.CURIE_PREFIX_GO -HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS -HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC -HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS -HPO_PREFIX = kg2_util.CURIE_PREFIX_HP -ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS -ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 -MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS -MEDLINEPLUS_PREFIX = kg2_util.CURIE_PREFIX_UMLS -MSH_PREFIX = kg2_util.CURIE_PREFIX_MESH -MTH_PREFIX = kg2_util.CURIE_PREFIX_UMLS -NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON -NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT -NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF -OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM -PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ -PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY -RXNORM_PREFIX = 
kg2_util.CURIE_PREFIX_RXNORM -VANDF_PREFIX = kg2_util.CURIE_PREFIX_VANDF - -UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE - -# Mined from HTML Page Source of https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html -ACCESSION_HEIRARCHY = list() -ACCESSION_SOURCES_HEIRARCHY = dict() - # [('MTH', 'PN'), ('RXNORM', 'SCD'), ('RXNORM', 'SBD'), ('RXNORM', 'SCDG'), ('RXNORM', 'SBDG'), ('RXNORM', 'BPCK'), ('RXNORM', 'GPCK'), - # ('RXNORM', 'IN'), ('RXNORM', 'PSN'), ('RXNORM', 'MIN'), ('RXNORM', 'SCDF'), ('RXNORM', 'SBDF'), ('RXNORM', 'SCDC'), ('RXNORM', 'DFG'), - # ('RXNORM', 'DF'), ('RXNORM', 'SBDC'), ('RXNORM', 'BN'), ('RXNORM', 'PIN'), ('RXNORM', 'TMSY'), ('RXNORM', 'SY'), ('MSH', 'MH'), - # ('MSH', 'TQ'), ('MSH', 'PEP'), ('MSH', 'ET'), ('MSH', 'XQ'), ('MSH', 'PXQ'), ('MSH', 'NM'), ('HPO', 'PT'), ('HPO', 'SY'), ('HPO', 'ET'), - # ('NCBI', 'SCN'), ('ATC', 'RXN_PT'), ('ATC', 'PT'), ('VANDF', 'PT'), ('VANDF', 'CD'), ('VANDF', 'IN'), ('DRUGBANK', 'IN'), - # ('DRUGBANK', 'SY'), ('DRUGBANK', 'FSY'), ('MSH', 'N1'), ('MSH', 'PCE'), ('MSH', 'CE'), ('FMA', 'PT'), ('FMA', 'SY'), ('FMA', 'AB'), - # ('ATC', 'RXN_IN'), ('ATC', 'IN'), ('VANDF', 'AB'), ('VANDF', 'MTH_RXN_CD'), ('NDDF', 'MTH_RXN_CDC'), ('NDDF', 'CDC'), ('NDDF', 'CDD'), - # ('NDDF', 'CDA'), ('NDDF', 'IN'), ('NDDF', 'DF'), ('MED-RT', 'PT'), ('MED-RT', 'FN'), ('MED-RT', 'SY'), ('HCPCS', 'PT'), ('HCPCS', 'MP'), - # ('OMIM', 'PT'), ('OMIM', 'PHENO'), ('OMIM', 'PHENO_ET'), ('OMIM', 'PTAV'), ('OMIM', 'PTCS'), ('OMIM', 'ETAL'), ('OMIM', 'ET'), - # ('OMIM', 'HT'), ('OMIM', 'ACR'), ('HGNC', 'PT'), ('HGNC', 'ACR'), ('HGNC', 'MTH_ACR'), ('HGNC', 'NA'), ('HGNC', 'SYN'), ('HGNC', 'NP'), - # ('HGNC', 'NS'), ('NCI', 'PT'), ('NCI', 'SY'), ('NCI', 'CSN'), ('NCI', 'DN'), ('NCI', 'FBD'), ('NCI', 'HD'), ('NCI', 'CCN'), - # ('NCI', 'AD'), ('NCI', 'CA2'), ('NCI', 'CA3'), ('NCI', 'BN'), ('NCI', 'AB'), ('NCI', 'CCS'), ('PDQ', 'PT'), ('PDQ', 'HT'), - # ('PDQ', 'PSC'), ('PDQ', 'SY'), 
('CHV', 'PT'), ('MEDLINEPLUS', 'PT'), ('GO', 'PT'), ('GO', 'MTH_PT'), ('GO', 'ET'), ('GO', 'MTH_ET'), - # ('GO', 'SY'), ('GO', 'MTH_SY'), ('PDQ', 'ET'), ('PDQ', 'CU'), ('PDQ', 'LV'), ('PDQ', 'ACR'), ('PDQ', 'AB'), ('PDQ', 'BN'), ('PDQ', 'FBD'), - # ('PDQ', 'CCN'), ('PDQ', 'CHN'), ('NCBI', 'USN'), ('NCBI', 'USY'), ('NCBI', 'SY'), ('NCBI', 'UCN'), ('NCBI', 'CMN'), ('NCBI', 'UE'), - # ('NCBI', 'EQ'), ('ICD9CM', 'PT'), ('ICD9CM', 'HT'), ('ICD10PCS', 'PT'), ('ICD10PCS', 'PX'), ('ICD10PCS', 'HX'), ('ICD10PCS', 'MTH_HX'), - # ('ICD10PCS', 'HT'), ('ICD10PCS', 'HS'), ('ICD10PCS', 'AB'), ('HL7V3.0', 'CSY'), ('HL7V3.0', 'PT'), ('HL7V3.0', 'CDO'), ('HL7V3.0', 'VS'), - # ('HL7V3.0', 'BR'), ('HL7V3.0', 'CPR'), ('HL7V3.0', 'CR'), ('HL7V3.0', 'NPT'), ('HCPCS', 'MTH_HT'), ('MTH', 'CV'), ('MTH', 'XM'), - # ('MTH', 'PT'), ('MTH', 'SY'), ('MTH', 'RT'), ('ICD9CM', 'AB'), ('PSY', 'PT'), ('PSY', 'HT'), ('PSY', 'ET'), ('MEDLINEPLUS', 'ET'), - # ('MEDLINEPLUS', 'SY'), ('MEDLINEPLUS', 'HT'), ('MSH', 'HT'), ('MSH', 'HS'), ('MSH', 'DEV'), ('MSH', 'DSV'), ('MSH', 'QAB'), - # ('MSH', 'QEV'), ('MSH', 'QSV'), ('MSH', 'PM'), ('HCPCS', 'AB'), ('MTH', 'DT'), ('HCPCS', 'AM'), ('CHV', 'SY'), ('RXNORM', 'ET'), - # ('HPO', 'OP'), ('HPO', 'IS'), ('NCI', 'OP'), ('HPO', 'OET'), ('HCPCS', 'OP'), ('HCPCS', 'OM'), ('HCPCS', 'OAM'), ('GO', 'OP'), - # ('GO', 'MTH_OP'), ('GO', 'OET'), ('GO', 'MTH_OET'), ('GO', 'IS'), ('GO', 'MTH_IS'), ('PDQ', 'OP'), ('PDQ', 'IS'), ('HL7V3.0', 'OP'), - # ('HL7V3.0', 'ONP'), ('HCPCS', 'OA'), ('FMA', 'OP'), ('FMA', 'IS')] +# ATC_PREFIX = kg2_util.CURIE_PREFIX_ATC +# CHV_PREFIX = kg2_util.CURIE_PREFIX_CHV +# DRUGBANK_PREFIX = kg2_util.CURIE_PREFIX_DRUGBANK +# FMA_PREFIX = kg2_util.CURIE_PREFIX_FMA +# GO_PREFIX = kg2_util.CURIE_PREFIX_GO +# HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS +# HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC +# HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS +# HPO_PREFIX = kg2_util.CURIE_PREFIX_HP +# ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS +# ICD9CM_PREFIX = 
kg2_util.CURIE_PREFIX_ICD9 +# MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS +# MEDLINEPLUS_PREFIX = kg2_util.CURIE_PREFIX_UMLS +# MSH_PREFIX = kg2_util.CURIE_PREFIX_MESH +# MTH_PREFIX = kg2_util.CURIE_PREFIX_UMLS +# NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON +# NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT +# NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF +# OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM +# PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ +# PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY +# RXNORM_PREFIX = kg2_util.CURIE_PREFIX_RXNORM +# VANDF_PREFIX = kg2_util.CURIE_PREFIX_VANDF + +# UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE + +# DESIRED_CODES = {'ATC': [umls_util.process_atc_item, kg2_util.CURIE_PREFIX_ATC, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'ATC')], +# 'CHV': [umls_util.process_chv_item, kg2_util.CURIE_PREFIX_CHV, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'CHV')], +# 'DRUGBANK': [umls_util.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'DRUGBANK')], +# 'FMA': [umls_util.process_fma_item, kg2_util.CURIE_PREFIX_FMA, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'FMA')], +# 'GO': [umls_util.process_go_item, kg2_util.CURIE_PREFIX_GO, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'GO')], +# 'HCPCS': [umls_util.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HCPCS')], +# 'HGNC': [umls_util.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HGNC')], +# 'HL7V3.0': [umls_util.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HL7')], +# 'HPO': [umls_util.process_hpo_item, kg2_util.CURIE_PREFIX_HP, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HPO')], +# 'ICD10PCS': [umls_util.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS')], +# 'ICD9CM': [umls_util.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 
'ICD9CM')], +# 'MED-RT': [umls_util.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT')], +# 'MEDLINEPLUS': [umls_util.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS')], +# 'MSH': [umls_util.process_msh_item, kg2_util.CURIE_PREFIX_MESH, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MSH')], +# 'MTH': [umls_util.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MTH')], +# 'NCBI': [umls_util.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCBITAXON')], +# 'NCI': [umls_util.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], +# 'NDDF': [umls_util.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], +# 'OMIM': [umls_util.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'OMIM')], +# 'PDQ': [umls_util.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'PDQ')], +# 'PSY': [umls_util.process_psy_item, kg2_util.CURIE_PREFIX_PSY, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'PSY')], +# 'RXNORM': [umls_util.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'RXNORM')], +# 'VANDF': [umls_util.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'VANDF')]} + +# # Mined from HTML Page Source of https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html +# ACCESSION_HEIRARCHY = list() +# ACCESSION_SOURCES_HEIRARCHY = dict() def get_args(): arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') @@ -95,608 +91,16 @@ def extract_node_id(node_id_str): return node_id[0].strip(), node_id[1].strip() -def 
make_node_id(curie_prefix, node_id): - return curie_prefix + ':' + node_id - - -def create_description(comment, tuis): - description = comment - for tui in tuis: - description += "; UMLS Semantic Type: STY:" + tui - description = description.strip("; ") - return description - - -def get_name_synonyms(names_dict, accession_heirarchy): - names = list() - for key in accession_heirarchy: - names += [name for name in names_dict.get(key, dict()).get('Y', list())] - names += [name for name in names_dict.get(key, dict()).get('N', list())] - assert len(names) > 0 - if len(names) == 1: - return names[0], list() - return names[0], names[1:] - - -def get_name_keys(names_dict): - keys_list = [] - for key in names_dict: - keys_list.append(key) - return str(sorted(keys_list)) - - -def get_attribute_keys(attributes_dict): - keys_list = [] - for key in attributes_dict: - keys_list.append(key) - return set(keys_list) - - -def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): - node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) - node['synonym'] = synonyms - node['description'] = description - - nodes_output.write(node) - - -def get_basic_info(curie_prefix, node_id, info, umls_code): - # accession_heirarchy - # for (umls_code_compare, name_key) in ACCESSION_HEIRARCHY: - - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) - if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: - if len(cuis) != 1: - return None, None, None, None, None, None, None, None - node_id = cuis[0] - node_curie = make_node_id(curie_prefix, node_id) - iri = IRI_MAPPINGS[curie_prefix] + node_id - category = TUI_MAPPINGS[str(tuple(tuis))] - - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, accession_heirarchy) - - return node_curie, iri, name, category, synonyms, cuis, tuis - - -def process_atc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): 
- node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) - - # Currently not used, but extracting them in case we want them in the future - atc_level = info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] - is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_chv_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY']) - - # Currently not used, but extracting them in case we want them in the future - combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) - combo_score_no_top_words = info.get(INFO_KEY, dict()).get('COMBO_SCORE_NO_TOP_WORDS', list()) - context_score = info.get(INFO_KEY, dict()).get('CONTEXT_SCORE', list()) - cui_score = info.get(INFO_KEY, dict()).get('CUI_SCORE', list()) - disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) - frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_drugbank_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['IN', 'SY', 'FSY']) - - # Currently not used, but extracting them in case we want them in the future - fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) - secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) - - # TODO: figure out update date - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def 
process_fma_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY', 'AB', 'OP', 'IS']) - - # Currently not used, but extracting them in case we want them in the future - authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) - date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_go_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY', 'OP', 'MTH_OP', 'OET', 'MTH_OET', 'IS', 'MTH_IS'] - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('GO:', ''), info, accession_heirarchy) - - # GO-specific information - attributes = info.get(INFO_KEY, dict()) - go_namespace = attributes.get('GO_NAMESPACE', list()) - assert len(go_namespace) == 1 - go_namespace = go_namespace[0] - namespace_category_map = {'molecular_function': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, - 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, - 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} - category = namespace_category_map.get(go_namespace, category) - go_comment = attributes.get('GO_COMMENT', str()) - if len(go_comment) > 0: - go_comment = go_comment[0] - go_comment = "// COMMENTS: " + go_comment - - # Currently not used, but extracting them in case we want them in the future - date_created = attributes.get('DATE_CREATED', list()) - go_subset = attributes.get('GO_SUBSET', list()) - gxr = attributes.get('GXR', list()) - ref = attributes.get('REF', list()) - sid = attributes.get('SID', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, 
create_description(go_comment, tuis), nodes_output) - - -def process_hcpcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'MP', 'MTH_HT']) - - # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html - attributes = info.get(INFO_KEY, dict()) - had = attributes.get('HAD', list()) # HCPCS Action Effective Date - effective date of action to a procedure or modifier code. - hcc = attributes.get('HCC', list()) # HCPCS Coverage Code - code denoting Medicare coverage status. There are two subelements separated by "=". - hts = attributes.get('HTS', list()) # HCPCS Type of Service Code - carrier assigned HCFA Type of Service which describes the particular kind(s) of service represented by the procedure code. - hcd = attributes.get('HCD', list()) # HCPCS Code Added Date - year the HCPCS code was added to the HCFA Common Procedure Coding System. - hpn = attributes.get('HPN', list()) # HCPCS processing note number identifying the processing note contained in Appendix A of the HCPCS Manual. - haq = attributes.get('HAQ', list()) # HCPCS Anesthesia Base Unit Quantity - base unit represents the level of intensity for anesthesia procedure services that reflects all activities except time. - hlc = attributes.get('HLC', list()) # HCPCS Lab Certification Code - code used to classify laboratory procedures according to the specialty certification categories listed by CMS(formerly HCFA). - hsn = attributes.get('HSN', list()) # HCPCS Statute Number identifying statute reference for coverage or noncoverage of procedure or service. - hpd = attributes.get('HPD', list()) # HCPCS ASC payment group effective date - date the procedure is assigned to the ASC payment group. 
- hpg = attributes.get('HPG', list()) # HCPCS ASC payment group code which represents the dollar amount of the facility charge payable by Medicare for the procedure. - hmg = attributes.get('HMR', list()) # HCPCS Medicare Carriers Manual reference section number - number identifying a section of the Medicare Carriers Manual. - hir = attributes.get('HIR', list()) # HCPCS Coverage Issues Manual Reference Section Number - number identifying the Reference Section of the Coverage Issues Manual. - hxr = attributes.get('HXR', list()) # HCPCS Cross reference code - an explicit reference crosswalking a deleted code or a code that is not valid for Medicare to a valid current code (or range of codes). - hmp = attributes.get('HMP', list()) # HCPCS Multiple Pricing Indicator Code - code used to identify instances where a procedure could be priced. - hpi = attributes.get('HPI', list()) # HCPCS Pricing Indicator Code - used to identify the appropriate methodology for developing unique pricing amounts under Part B. - hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. - hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. 
- - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hgnc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'SYN', 'NP', 'NS'] - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HGNC:', ''), info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html - attributes = info.get(INFO_KEY, dict()) - mgd_id = attributes.get('MGD_ID', list()) - vega_id = attributes.get('VEGA_ID', list()) - genecc = attributes.get('GENCC', list()) - swp = attributes.get('SWP', list()) - mane_select = attributes.get('MANE_SELECT', list()) - local_specific_db_xr = attributes.get('LOCUS_SPECIFIC_DB_XR', list()) - locus_type = attributes.get('LOCUS_TYPE', list()) - agr = attributes.get('AGR', list()) - cytogenetic_location = attributes.get('CYTOGENETIC_LOCATION', list()) - date_created = attributes.get('DATE_CREATED', list()) - ensemblgene_id = attributes.get('ENSEMBLGENE_ID', list()) - db_xr_id = attributes.get('DB_XR_ID', list()) - locus_group = attributes.get('LOCUS_GROUP', list()) - entrezgene_id = attributes.get('ENTREZGENE_ID', list()) - date_name_changed = attributes.get('DATE_NAME_CHANGED', list()) - pmid = attributes.get('PMID', list()) - date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) - mapped_ucsc_id = attributes.get('MAPPED_UCSC_ID', list()) - refseq_id = attributes.get('REFSEQ_ID', list()) - ena = attributes.get('ENA', list()) - rgd_id = attributes.get('RGD_ID', list()) - date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) - omim_id = attributes.get('OMIM_ID', list()) - gene_fam_id = attributes.get('GENE_FAM_ID', list()) - gene_symbol = 
attributes.get('GENESYMBOL', list()) - ez = attributes.get('EZ', list()) - ccds_id = attributes.get('CCDS_ID', list()) - lncipedia = attributes.get('LNCIPEDIA', list()) - gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hl7_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html - attributes = info.get(INFO_KEY, dict()) - hl7at = attributes.get('HL7AT', list()) - hl7ii = attributes.get('HL7II', list()) - hl7im = attributes.get('HL7IM', list()) - hl7lt = attributes.get('HL7LT', list()) - hl7un = attributes.get('HL7UN', list()) - hl7oa = attributes.get('HL7OA', list()) - hl7scs = attributes.get('HL7SCS', list()) - hl7cc = attributes.get('HL7CC', list()) - hl7na = attributes.get('HL7NA', list()) - hl7in = attributes.get('HL7IN', list()) - hl7ap = attributes.get('HL7AP', list()) - hl7mi = attributes.get('HL7MI', list()) - hl7hi = attributes.get('HL7HI', list()) - hl7ir = attributes.get('HL7IR', list()) - hl7ai = attributes.get('HL7AI', list()) - hl7ha = attributes.get('HL7HA', list()) - hl7rf = attributes.get('HL7RF', list()) - hl7rd = attributes.get('HL7RD', list()) - hl7vd = attributes.get('HL7VD', list()) - hl7dc = attributes.get('HL7DC', list()) - hl7rk = attributes.get('HL7RK', list()) - hl7is = attributes.get('HL7IS', list()) - hl7sy = 
attributes.get('HL7SY', list()) - hl7cd = attributes.get('HL7CD', list()) - hl7sl = attributes.get('HL7SL', list()) - hl7pl = attributes.get('HL7PL', list()) - hl7vc = attributes.get('HL7VC', list()) - hl7ty = attributes.get('HL7TY', list()) - hl7rg = attributes.get('HL7RG', list()) - hl7csc = attributes.get('HL7CSC', list()) - hl7od = attributes.get('HL7OD', list()) - hl7id = attributes.get('HL7ID', list()) - hl7tr = attributes.get('HL7TR', list()) - hl7di = attributes.get('HL7DI', list()) - hl7cs = attributes.get('HL7CS', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hpo_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HP:', ''), info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - sid = attributes.get('SID', list()) - hpo_comment = attributes.get('HPO_COMMENT', list()) - date_created = attributes.get('DATE_CREATED', list()) - syn_qualifier = attributes.get('SYN_QUALIFIER', list()) - ref = attributes.get('REF', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - +def create_accession_heirarchy(full_heirarchy): + for [source, key] in full_heirarchy: + if source in DESIRED_CODES: + ACCESSION_HEIRARCHY.append((source, key)) -def process_icd10pcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # 
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - added_meaning = attributes.get('ADDED_MEANING', list()) - order_no = attributes.get('ORDER_NO', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_icd9cm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - icc = attributes.get('ICC', list()) - ice = attributes.get('ICE', list()) - icf = attributes.get('ICF', list()) - sos = attributes.get('SOS', list()) - icn = attributes.get('ICN', list()) - ica = attributes.get('ICA', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_medrt_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'FN', 'SY'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want 
them in the future - attributes = info.get(INFO_KEY, dict()) - term_status = attributes.get('TERM_STATUS', list()) - concept_type = attributes.get('CONCEPT_TYPE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_medlineplus_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'ET', 'SY', 'HT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - sos = attributes.get('SOS', list()) - date_created = attributes.get('DATE_CREATED', list()) - mp_group_url = attributes.get('MP_GROUP_URL', list()) - mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) - mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_msh_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - mmr = attributes.get('MMR', list()) - fx = 
attributes.get('FX', list()) - lt = attributes.get('LT', list()) - dc = attributes.get('DC', list()) - pa = attributes.get('PA', list()) - rr = attributes.get('RR', list()) - hm = attributes.get('HM', list()) - pi = attributes.get('PI', list()) - ec = attributes.get('EC', list()) - hn = attributes.get('HN', list()) - termui = attributes.get('TERMUI', list()) - th = attributes.get('TH', list()) - sos = attributes.get('SOS', list()) - ii = attributes.get('II', list()) - rn = attributes.get('RN', list()) - an = attributes.get('AN', list()) - cx = attributes.get('CX', list()) - dq = attributes.get('DQ', list()) - dx = attributes.get('DX', list()) - pm = attributes.get('PM', list()) - aql = attributes.get('AQL', list()) - sc = attributes.get('SC', list()) - fr = attributes.get('FR', list()) - mda = attributes.get('MDA', list()) - src = attributes.get('SRC', list()) - ol = attributes.get('OL', list()) - mn = attributes.get('MN', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_mth_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PN', 'CV', 'XM', 'PT', 'SY', 'RT', 'DT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - mth_mapsetcomplexity = attributes.get('MTH_MAPSETCOMPLEXITY', list()) - fromvsab = attributes.get('FROMVSAB', list()) - mapsetrsab = attributes.get('MAPSETRSAB', list()) - mapsetversion = attributes.get('MAPSETVERSION', list()) - mapsetvsab = attributes.get('MAPSETVSAB', list()) - tovsab = attributes.get('TOVSAB', list()) - mth_mapfromexhaustive = 
attributes.get('MTH_MAPFROMEXHAUSTIVE', list()) - torsab = attributes.get('TORSAB', list()) - mapsetsid = attributes.get('MAPSETSID', list()) - mapsetgrammar = attributes.get('MAPSETGRAMMAR', list()) - mapsettype = attributes.get('MAPSETTYPE', list()) - mth_maptoexhaustive = attributes.get('MTH_MAPTOEXHAUSTIVE', list()) - fromrsab = attributes.get('FROMRSAB', list()) - mth_mapfromcomplexity = attributes.get('MTH_MAPFROMCOMPLEXITY', list()) - lt = attributes.get('LT', list()) - mth_maptocomplexity = attributes.get('MTH_MAPTOCOMPLEXITY', list()) - sos = attributes.get('SOS', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_ncbi_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - div = attributes.get('DIV', list()) - authority_name = attributes.get('AUTHORITY_NAME', list()) - rank = attributes.get('RANK', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_nci_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = 
make_node_id(UMLS_SOURCE_PREFIX, 'NCI') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) - micronutrient = attributes.get('MICRONUTRIENT', list()) - genbank_accession_number = attributes.get('GENBANK_ACCESSION_NUMBER', list()) - fda_table = attributes.get('FDA_TABLE', list()) - usda_id = attributes.get('USDA_ID', list()) - icd_o_3_code = attributes.get('ICD-O-3_CODE', list()) - tolerable_level = attributes.get('TOLERABLE_LEVEL', list()) - ncbi_taxon_id = attributes.get('NCBI_TAXON_ID', list()) - mgi_accession_id = attributes.get('MGI_ACCESSION_ID', list()) - homologous_gene = attributes.get('HOMOLOGOUS_GENE', list()) - pid_id = attributes.get('PID_ID', list()) - swiss_prot = attributes.get('SWISS_PROT', list()) - essential_amino_acid = attributes.get('ESSENTIAL_AMINO_ACID', list()) - publish_value_set = attributes.get('PUBLISH_VALUE_SET', list()) - cas_registry = attributes.get('CAS_REGISTRY', list()) - value_set_pair = attributes.get('VALUE_SET_PAIR', list()) - accepted_therapeutic_use_for = attributes.get('ACCEPTED_THERAPEUTIC_USE_FOR', list()) - hgnc_id = attributes.get('HGNC_ID', list()) - nci_drug_dictionary_id = attributes.get('NCI_DRUG_DICTIONARY_ID', list()) - chebi_id = attributes.get('CHEBI_ID', list()) - cnu = attributes.get('CNU', list()) - mirbase_id = attributes.get('MIRBASE_ID', list()) - macronutrient = attributes.get('MACRONUTRIENT', list()) - essential_fatty_acid = attributes.get('ESSENTIAL_FATTY_ACID', list()) - unit = attributes.get('UNIT', list()) - pdq_open_trial_search_id = attributes.get('PDQ_OPEN_TRIAL_SEARCH_ID', list()) - term_browser_value_set_description = attributes.get('TERM_BROWSER_VALUE_SET_DESCRIPTION', list()) - entrezgene_id = attributes.get('ENTREZGENE_ID', list()) - infoods = attributes.get('INFOODS', list()) - pubmedid_primary_reference = attributes.get('PUBMEDID_PRIMARY_REFERENCE', 
list()) - biocarta_id = attributes.get('BIOCARTA_ID', list()) - extensible_list = attributes.get('EXTENSIBLE_LIST', list()) - use_for = attributes.get('USE_FOR', list()) - neoplastic_status = attributes.get('NEOPLASTIC_STATUS', list()) - nsc_number = attributes.get('NSC_NUMBER', list()) - omim_number = attributes.get('OMIM_NUMBER', list()) - lt = attributes.get('LT', list()) - kegg_id = attributes.get('KEGG_ID', list()) - gene_encodes_product = attributes.get('GENE_ENCODES_PRODUCT', list()) - pdq_closed_trial_search_id = attributes.get('PDQ_CLOSED_TRIAL_SEARCH_ID', list()) - design_note = attributes.get('DESIGN_NOTE', list()) - nutrient = attributes.get('NUTRIENT', list()) - fda_unii_code = attributes.get('FDA_UNII_CODE', list()) - us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) - chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_nddf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['MTH_RXN_CDC', 'CDC', 'CDD', 'CDA', 'IN', 'DF'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndc = attributes.get('NDC', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_omim_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'PHENO', 'PHENO_ET', 'PTAV', 'PTCS', 'ETAL', 'ET', 'HT', 'ACR'] # 
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - genesymbol = attributes.get('GENESYMBOL', list()) - mimtypevalue = attributes.get('MIMTYPEVALUE', list()) - moved_from = attributes.get('MOVED_FROM', list()) - sos = attributes.get('SOS', list()) - genelocus = attributes.get('GENELOCUS', list()) - mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) - mimtype = attributes.get('MIMTYPE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_pdq_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'PSC', 'SY', 'ET', 'CU', 'LV', 'ACR', 'AB', 'BN', 'FBD', 'CCN', 'CHN', 'OP', 'IS'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - lt = attributes.get('LT', list()) - cas_registry = attributes.get('CAS_REGISTRY', list()) - date_first_published = attributes.get('DATE_FIRST_PUBLISHED', list()) - date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) - ind_code = attributes.get('IND_CODE', list()) - pid = attributes.get('PID', list()) - nsc_code = attributes.get('NSC_CODE', list()) - pxc = attributes.get('PXC', list()) - menu_parent = attributes.get('MENU_PARENT', list()) - nci_id = attributes.get('NCI_ID', list()) - orig_sty = attributes.get('ORIG_STY', list()) - menu_type = 
attributes.get('MENU_TYPE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_psy_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - hn = attributes.get('HN', list()) - pyr = attributes.get('PYR', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_rxnorm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['SCD', 'SBD', 'SCDG', 'SBDG', 'BPCK', 'GPCK', 'IN', 'PSN', 'MIN', 'SCDF', 'SBDF', 'SCDC', 'DFG', 'DF', 'SBDC', 'BN', 'PIN', 'TMSY', 'SY', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndc = attributes.get('NDC', list()) - rxn_obsoleted = attributes.get('RXN_OBSOLETED', list()) - rxn_available_strength = attributes.get('RXN_AVAILABLE_STRENGTH', list()) - rxn_human_drug = attributes.get('RXN_HUMAN_DRUG', list()) - rxn_quantity = attributes.get('RXN_QUANTITY', list()) - rxterm_form = attributes.get('RXTERM_FORM', list()) - rxn_in_expressed_flag = attributes.get('RXN_IN_EXPRESSED_FLAG', list()) - rxaui = attributes.get('RXAUI', list()) - rxn_bn_cardinality = 
attributes.get('RXN_BN_CARDINALITY', list()) - rxn_activated = attributes.get('RXN_ACTIVATED', list()) - rxn_boss_strength_denom_unit = attributes.get('RXN_BOSS_STRENGTH_DENOM_UNIT', list()) - ambiguity_flag = attributes.get('AMBIGUITY_FLAG', list()) - rxn_strength = attributes.get('RXN_STRENGTH', list()) - rxcui = attributes.get('RXCUI', list()) - rxn_ai = attributes.get('RXN_AI', list()) - rxn_boss_from = attributes.get('RXN_BOSS_FROM', list()) - rxn_boss_strength_num_unit = attributes.get('RXN_BOSS_STRENGTH_NUM_UNIT', list()) - rxn_vet_drug = attributes.get('RXN_VET_DRUG', list()) - orig_code = attributes.get('ORIG_CODE', list()) - rxn_am = attributes.get('RXN_AM', list()) - rxn_boss_strength_denom_value = attributes.get('RXN_BOSS_STRENGTH_DENOM_VALUE', list()) - rxn_boss_strength_num_value = attributes.get('RXN_BOSS_STRENGTH_NUM_VALUE', list()) - rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) - orig_source = attributes.get('ORIG_SOURCE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'CD', 'IN', 'AB', 'MTH_RXN_CD'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndf_transmit_to_cmop = attributes.get('NDF_TRANSMIT_TO_CMOP', list()) - sngl_or_mult_src_prd = attributes.get('SNGL_OR_MULT_SRC_PRD', list()) - dcsa = attributes.get('DCSA', list()) - exclude_di_check = attributes.get('EXCLUDE_DI_CHECK', list()) - nfi = attributes.get('NFI', list()) - va_class_name = attributes.get('VA_CLASS_NAME', 
list()) - vmo = attributes.get('VMO', list()) - drug_class_type = attributes.get('DRUG_CLASS_TYPE', list()) - nf_name = attributes.get('NF_NAME', list()) - ndc = attributes.get('NDC', list()) - vac = attributes.get('VAC', list()) - va_generic_name = attributes.get('VA_GENERIC_NAME', list()) - parent_class = attributes.get('PARENT_CLASS', list()) - va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) - ddf = attributes.get('DDF', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -DESIRED_CODES = {'ATC': [process_atc_item, kg2_util.CURIE_PREFIX_ATC, make_node_id(UMLS_SOURCE_PREFIX, 'ATC')], - 'CHV': [process_chv_item, kg2_util.CURIE_PREFIX_CHV, make_node_id(UMLS_SOURCE_PREFIX, 'CHV')], - 'DRUGBANK': [process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, make_node_id(UMLS_SOURCE_PREFIX, 'DRUGBANK')], - 'FMA': [process_fma_item, kg2_util.CURIE_PREFIX_FMA, make_node_id(UMLS_SOURCE_PREFIX, 'FMA')], - 'GO': [process_go_item, kg2_util.CURIE_PREFIX_GO, make_node_id(UMLS_SOURCE_PREFIX, 'GO')], - 'HCPCS': [process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, make_node_id(UMLS_SOURCE_PREFIX, 'HCPCS')], - 'HGNC': [process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, make_node_id(UMLS_SOURCE_PREFIX, 'HGNC')], - 'HL7V3.0': [process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'HL7')], - 'HPO': [process_hpo_item, kg2_util.CURIE_PREFIX_HP, make_node_id(UMLS_SOURCE_PREFIX, 'HPO')], - 'ICD10PCS': [process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS')], - 'ICD9CM': [process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM')], - 'MED-RT': [process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT')], - 'MEDLINEPLUS': [process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS')], - 'MSH': [process_msh_item, 
kg2_util.CURIE_PREFIX_MESH, make_node_id(UMLS_SOURCE_PREFIX, 'MSH')], - 'MTH': [process_mth_item, kg2_util.CURIE_PREFIX_UMLS, make_node_id(UMLS_SOURCE_PREFIX, 'MTH')], - 'NCBI': [process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, make_node_id(UMLS_SOURCE_PREFIX, 'NCBITAXON')], - 'NCI': [process_nci_item, kg2_util.CURIE_PREFIX_NCIT, make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], - 'NDDF': [process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], - 'OMIM': [process_omim_item, kg2_util.CURIE_PREFIX_OMIM, make_node_id(UMLS_SOURCE_PREFIX, 'OMIM')], - 'PDQ': [process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, make_node_id(UMLS_SOURCE_PREFIX, 'PDQ')], - 'PSY': [process_psy_item, kg2_util.CURIE_PREFIX_PSY, make_node_id(UMLS_SOURCE_PREFIX, 'PSY')], - 'RXNORM': [process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, make_node_id(UMLS_SOURCE_PREFIX, 'RXNORM')], - 'VANDF': [process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, make_node_id(UMLS_SOURCE_PREFIX, 'VANDF')]} +def create_accession_sources_heirarchy(): + for (source, key) in ACCESSION_HEIRARCHY: + if source not in ACCESSION_SOURCES_HEIRARCHY: + ACCESSION_SOURCES_HEIRARCHY[source] = list() + ACCESSION_SOURCES_HEIRARCHY[source].append(key) if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) @@ -720,12 +124,13 @@ def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, cur TUI_MAPPINGS = json.load(mappings) iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('curies-to-urls-map.yaml'))['use_for_bidirectional_mapping'] - heirarchy = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('umls-name-heirarchy.yaml')) - print(json.dumps(heirarchy, indent=4)) + full_heirarchy = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('umls-name-heirarchy.yaml')) for item in iri_mappings_raw: for prefix in item: IRI_MAPPINGS[prefix] = item[prefix] + umls_processor = 
umls_util.UMLS_Processor(nodes_output, edges_output, TUI_MAPPINGS, IRI_MAPPINGS, full_heirarchy) + for data in input_items: # There should only be one item in the data dictionary for entity in data: @@ -734,15 +139,9 @@ def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, cur value = data[entity] source, node_id = extract_node_id(entity) - if source not in DESIRED_CODES: - continue - # Process the data specifically by source - [source_function, curie_prefix, provided_by] = DESIRED_CODES[source] - source_function(node_id, value, nodes_output, edges_output, source, curie_prefix, provided_by) + umls_processor.process_node(source, node_id, value) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) - print(json.dumps(name_keys, indent=4, sort_keys=True, default=list)) - print(json.dumps(attribute_keys, indent=4, sort_keys=True, default=list)) print("Finishing umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) diff --git a/umls_util.py b/umls_util.py index 84d76c55..59705e5c 100644 --- a/umls_util.py +++ b/umls_util.py @@ -16,558 +16,603 @@ import kg2_util - -def make_node_id(curie_prefix, node_id): - return curie_prefix + ':' + node_id - - -def get_name_synonyms(names_dict, accession_heirarchy): - names = list() - for key in accession_heirarchy: - names += [name for name in names_dict.get(key, dict()).get('Y', list())] - names += [name for name in names_dict.get(key, dict()).get('N', list())] - assert len(names) > 0 - if len(names) == 1: - return names[0], list() - return names[0], names[1:] - - -def make_umls_node(node_curie, iri, name, category, update_date, provided_by, synonyms, description, nodes_output): - node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) - node['synonym'] = synonyms - node['description'] = description - - nodes_output.write(node) - - -def get_basic_info(curie_prefix, node_id, info, 
umls_code): - # accession_heirarchy - # for (umls_code_compare, name_key) in ACCESSION_HEIRARCHY: - - cuis = info.get(CUIS_KEY, list()) - tuis = info.get(TUIS_KEY, list()) - if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: - if len(cuis) != 1: - return None, None, None, None, None, None, None, None - node_id = cuis[0] - node_curie = make_node_id(curie_prefix, node_id) - iri = IRI_MAPPINGS[curie_prefix] + node_id - category = TUI_MAPPINGS[str(tuple(tuis))] - - names = info.get(NAMES_KEY, dict()) - name, synonyms = get_name_synonyms(names, accession_heirarchy) - - return node_curie, iri, name, category, synonyms, cuis, tuis - -def process_atc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['RXN_PT', 'PT', 'RXN_IN', 'IN']) - - # Currently not used, but extracting them in case we want them in the future - atc_level = info.get(INFO_KEY, dict()).get('ATC_LEVEL', list())[0] - is_drug_class = info.get(INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_chv_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY']) - - # Currently not used, but extracting them in case we want them in the future - combo_score = info.get(INFO_KEY, dict()).get('COMBO_SCORE', list()) - combo_score_no_top_words = info.get(INFO_KEY, dict()).get('COMBO_SCORE_NO_TOP_WORDS', list()) - context_score = info.get(INFO_KEY, dict()).get('CONTEXT_SCORE', list()) - cui_score = info.get(INFO_KEY, dict()).get('CUI_SCORE', list()) - disparaged = info.get(INFO_KEY, dict()).get('DISPARAGED', list()) - frequency = info.get(INFO_KEY, dict()).get('FREQUENCY', list()) - - 
make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_drugbank_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['IN', 'SY', 'FSY']) - - # Currently not used, but extracting them in case we want them in the future - fda_codes = info.get(INFO_KEY, dict()).get('FDA_UNII_CODE', list()) - secondary_accession_keys = info.get(INFO_KEY, dict()).get('SID', list()) - - # TODO: figure out update date - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_fma_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'SY', 'AB', 'OP', 'IS']) - - # Currently not used, but extracting them in case we want them in the future - authority = info.get(INFO_KEY, dict()).get('AUTHORITY', list()) - date_last_modified = info.get(INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_go_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'MTH_PT', 'ET', 'MTH_ET', 'SY', 'MTH_SY', 'OP', 'MTH_OP', 'OET', 'MTH_OET', 'IS', 'MTH_IS'] - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('GO:', ''), info, accession_heirarchy) - - # GO-specific information - attributes = info.get(INFO_KEY, dict()) - go_namespace = attributes.get('GO_NAMESPACE', list()) - assert len(go_namespace) == 1 - go_namespace = go_namespace[0] - namespace_category_map = {'molecular_function': 
kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, - 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, - 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} - category = namespace_category_map.get(go_namespace, category) - go_comment = attributes.get('GO_COMMENT', str()) - if len(go_comment) > 0: - go_comment = go_comment[0] - go_comment = "// COMMENTS: " + go_comment - - # Currently not used, but extracting them in case we want them in the future - date_created = attributes.get('DATE_CREATED', list()) - go_subset = attributes.get('GO_SUBSET', list()) - gxr = attributes.get('GXR', list()) - ref = attributes.get('REF', list()) - sid = attributes.get('SID', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description(go_comment, tuis), nodes_output) - - -def process_hcpcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, ['PT', 'MP', 'MTH_HT']) - - # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html - attributes = info.get(INFO_KEY, dict()) - had = attributes.get('HAD', list()) # HCPCS Action Effective Date - effective date of action to a procedure or modifier code. - hcc = attributes.get('HCC', list()) # HCPCS Coverage Code - code denoting Medicare coverage status. There are two subelements separated by "=". - hts = attributes.get('HTS', list()) # HCPCS Type of Service Code - carrier assigned HCFA Type of Service which describes the particular kind(s) of service represented by the procedure code. - hcd = attributes.get('HCD', list()) # HCPCS Code Added Date - year the HCPCS code was added to the HCFA Common Procedure Coding System. 
- hpn = attributes.get('HPN', list()) # HCPCS processing note number identifying the processing note contained in Appendix A of the HCPCS Manual. - haq = attributes.get('HAQ', list()) # HCPCS Anesthesia Base Unit Quantity - base unit represents the level of intensity for anesthesia procedure services that reflects all activities except time. - hlc = attributes.get('HLC', list()) # HCPCS Lab Certification Code - code used to classify laboratory procedures according to the specialty certification categories listed by CMS(formerly HCFA). - hsn = attributes.get('HSN', list()) # HCPCS Statute Number identifying statute reference for coverage or noncoverage of procedure or service. - hpd = attributes.get('HPD', list()) # HCPCS ASC payment group effective date - date the procedure is assigned to the ASC payment group. - hpg = attributes.get('HPG', list()) # HCPCS ASC payment group code which represents the dollar amount of the facility charge payable by Medicare for the procedure. - hmg = attributes.get('HMR', list()) # HCPCS Medicare Carriers Manual reference section number - number identifying a section of the Medicare Carriers Manual. - hir = attributes.get('HIR', list()) # HCPCS Coverage Issues Manual Reference Section Number - number identifying the Reference Section of the Coverage Issues Manual. - hxr = attributes.get('HXR', list()) # HCPCS Cross reference code - an explicit reference crosswalking a deleted code or a code that is not valid for Medicare to a valid current code (or range of codes). - hmp = attributes.get('HMP', list()) # HCPCS Multiple Pricing Indicator Code - code used to identify instances where a procedure could be priced. - hpi = attributes.get('HPI', list()) # HCPCS Pricing Indicator Code - used to identify the appropriate methodology for developing unique pricing amounts under Part B. - hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. 
- hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hgnc_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'ACR', 'MTH_ACR', 'NA', 'SYN', 'NP', 'NS'] - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HGNC:', ''), info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - mgd_id = attributes.get('MGD_ID', list()) - vega_id = attributes.get('VEGA_ID', list()) - genecc = attributes.get('GENCC', list()) - swp = attributes.get('SWP', list()) - mane_select = attributes.get('MANE_SELECT', list()) - local_specific_db_xr = attributes.get('LOCUS_SPECIFIC_DB_XR', list()) - locus_type = attributes.get('LOCUS_TYPE', list()) - agr = attributes.get('AGR', list()) - cytogenetic_location = attributes.get('CYTOGENETIC_LOCATION', list()) - date_created = attributes.get('DATE_CREATED', list()) - ensemblgene_id = attributes.get('ENSEMBLGENE_ID', list()) - db_xr_id = attributes.get('DB_XR_ID', list()) - locus_group = attributes.get('LOCUS_GROUP', list()) - entrezgene_id = attributes.get('ENTREZGENE_ID', list()) - date_name_changed = attributes.get('DATE_NAME_CHANGED', list()) - pmid = attributes.get('PMID', list()) - date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) - mapped_ucsc_id = attributes.get('MAPPED_UCSC_ID', list()) - refseq_id = attributes.get('REFSEQ_ID', list()) - ena = attributes.get('ENA', list()) - rgd_id = attributes.get('RGD_ID', list()) - date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) - omim_id = attributes.get('OMIM_ID', list()) 
- gene_fam_id = attributes.get('GENE_FAM_ID', list()) - gene_symbol = attributes.get('GENESYMBOL', list()) - ez = attributes.get('EZ', list()) - ccds_id = attributes.get('CCDS_ID', list()) - lncipedia = attributes.get('LNCIPEDIA', list()) - gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hl7_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['CSY', 'PT', 'CDO', 'VS', 'BR', 'CPR', 'CR', 'NPT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html - attributes = info.get(INFO_KEY, dict()) - hl7at = attributes.get('HL7AT', list()) - hl7ii = attributes.get('HL7II', list()) - hl7im = attributes.get('HL7IM', list()) - hl7lt = attributes.get('HL7LT', list()) - hl7un = attributes.get('HL7UN', list()) - hl7oa = attributes.get('HL7OA', list()) - hl7scs = attributes.get('HL7SCS', list()) - hl7cc = attributes.get('HL7CC', list()) - hl7na = attributes.get('HL7NA', list()) - hl7in = attributes.get('HL7IN', list()) - hl7ap = attributes.get('HL7AP', list()) - hl7mi = attributes.get('HL7MI', list()) - hl7hi = attributes.get('HL7HI', list()) - hl7ir = attributes.get('HL7IR', list()) - hl7ai = attributes.get('HL7AI', list()) - hl7ha = attributes.get('HL7HA', list()) - hl7rf = attributes.get('HL7RF', list()) - hl7rd = attributes.get('HL7RD', list()) - hl7vd = attributes.get('HL7VD', list()) - hl7dc = attributes.get('HL7DC', list()) - hl7rk = attributes.get('HL7RK', 
list()) - hl7is = attributes.get('HL7IS', list()) - hl7sy = attributes.get('HL7SY', list()) - hl7cd = attributes.get('HL7CD', list()) - hl7sl = attributes.get('HL7SL', list()) - hl7pl = attributes.get('HL7PL', list()) - hl7vc = attributes.get('HL7VC', list()) - hl7ty = attributes.get('HL7TY', list()) - hl7rg = attributes.get('HL7RG', list()) - hl7csc = attributes.get('HL7CSC', list()) - hl7od = attributes.get('HL7OD', list()) - hl7id = attributes.get('HL7ID', list()) - hl7tr = attributes.get('HL7TR', list()) - hl7di = attributes.get('HL7DI', list()) - hl7cs = attributes.get('HL7CS', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_hpo_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'SY', 'ET', 'OP', 'IS', 'OET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id.replace('HP:', ''), info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - sid = attributes.get('SID', list()) - hpo_comment = attributes.get('HPO_COMMENT', list()) - date_created = attributes.get('DATE_CREATED', list()) - syn_qualifier = attributes.get('SYN_QUALIFIER', list()) - ref = attributes.get('REF', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_icd10pcs_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'PX', 'HX', 'MTH_HX', 'HT', 'HS', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = 
get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - added_meaning = attributes.get('ADDED_MEANING', list()) - order_no = attributes.get('ORDER_NO', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_icd9cm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'AB'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - icc = attributes.get('ICC', list()) - ice = attributes.get('ICE', list()) - icf = attributes.get('ICF', list()) - sos = attributes.get('SOS', list()) - icn = attributes.get('ICN', list()) - ica = attributes.get('ICA', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_medrt_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'FN', 'SY'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - term_status = attributes.get('TERM_STATUS', list()) - concept_type = attributes.get('CONCEPT_TYPE', list()) 
- - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_medlineplus_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'ET', 'SY', 'HT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - sos = attributes.get('SOS', list()) - date_created = attributes.get('DATE_CREATED', list()) - mp_group_url = attributes.get('MP_GROUP_URL', list()) - mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) - mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_msh_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['MH', 'TQ', 'PEP', 'ET', 'XQ', 'PXQ', 'NM', 'N1', 'PCE', 'CE', 'HT', 'HS', 'DEV', 'DSV', 'QAB', 'QEV', 'QSV', 'PM'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'MSH') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - mmr = attributes.get('MMR', list()) - fx = attributes.get('FX', list()) - lt = attributes.get('LT', list()) - dc = attributes.get('DC', list()) - pa = attributes.get('PA', list()) - rr = attributes.get('RR', list()) - 
hm = attributes.get('HM', list()) - pi = attributes.get('PI', list()) - ec = attributes.get('EC', list()) - hn = attributes.get('HN', list()) - termui = attributes.get('TERMUI', list()) - th = attributes.get('TH', list()) - sos = attributes.get('SOS', list()) - ii = attributes.get('II', list()) - rn = attributes.get('RN', list()) - an = attributes.get('AN', list()) - cx = attributes.get('CX', list()) - dq = attributes.get('DQ', list()) - dx = attributes.get('DX', list()) - pm = attributes.get('PM', list()) - aql = attributes.get('AQL', list()) - sc = attributes.get('SC', list()) - fr = attributes.get('FR', list()) - mda = attributes.get('MDA', list()) - src = attributes.get('SRC', list()) - ol = attributes.get('OL', list()) - mn = attributes.get('MN', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_mth_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PN', 'CV', 'XM', 'PT', 'SY', 'RT', 'DT'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - if node_curie == None: - return - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - mth_mapsetcomplexity = attributes.get('MTH_MAPSETCOMPLEXITY', list()) - fromvsab = attributes.get('FROMVSAB', list()) - mapsetrsab = attributes.get('MAPSETRSAB', list()) - mapsetversion = attributes.get('MAPSETVERSION', list()) - mapsetvsab = attributes.get('MAPSETVSAB', list()) - tovsab = attributes.get('TOVSAB', list()) - mth_mapfromexhaustive = attributes.get('MTH_MAPFROMEXHAUSTIVE', list()) - torsab = attributes.get('TORSAB', list()) - mapsetsid = attributes.get('MAPSETSID', list()) - mapsetgrammar = 
attributes.get('MAPSETGRAMMAR', list()) - mapsettype = attributes.get('MAPSETTYPE', list()) - mth_maptoexhaustive = attributes.get('MTH_MAPTOEXHAUSTIVE', list()) - fromrsab = attributes.get('FROMRSAB', list()) - mth_mapfromcomplexity = attributes.get('MTH_MAPFROMCOMPLEXITY', list()) - lt = attributes.get('LT', list()) - mth_maptocomplexity = attributes.get('MTH_MAPTOCOMPLEXITY', list()) - sos = attributes.get('SOS', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_ncbi_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['SCN', 'USN', 'USY', 'SY', 'UCN', 'CMN', 'UE', 'EQ'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - div = attributes.get('DIV', list()) - authority_name = attributes.get('AUTHORITY_NAME', list()) - rank = attributes.get('RANK', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_nci_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'SY', 'CSN', 'DN', 'FBD', 'HD', 'CCN', 'AD', 'CA2', 'CA3', 'BN', 'AB', 'CCS', 'OP'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - provided_by = make_node_id(UMLS_SOURCE_PREFIX, 'NCI') - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, 
dict()) - clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) - micronutrient = attributes.get('MICRONUTRIENT', list()) - genbank_accession_number = attributes.get('GENBANK_ACCESSION_NUMBER', list()) - fda_table = attributes.get('FDA_TABLE', list()) - usda_id = attributes.get('USDA_ID', list()) - icd_o_3_code = attributes.get('ICD-O-3_CODE', list()) - tolerable_level = attributes.get('TOLERABLE_LEVEL', list()) - ncbi_taxon_id = attributes.get('NCBI_TAXON_ID', list()) - mgi_accession_id = attributes.get('MGI_ACCESSION_ID', list()) - homologous_gene = attributes.get('HOMOLOGOUS_GENE', list()) - pid_id = attributes.get('PID_ID', list()) - swiss_prot = attributes.get('SWISS_PROT', list()) - essential_amino_acid = attributes.get('ESSENTIAL_AMINO_ACID', list()) - publish_value_set = attributes.get('PUBLISH_VALUE_SET', list()) - cas_registry = attributes.get('CAS_REGISTRY', list()) - value_set_pair = attributes.get('VALUE_SET_PAIR', list()) - accepted_therapeutic_use_for = attributes.get('ACCEPTED_THERAPEUTIC_USE_FOR', list()) - hgnc_id = attributes.get('HGNC_ID', list()) - nci_drug_dictionary_id = attributes.get('NCI_DRUG_DICTIONARY_ID', list()) - chebi_id = attributes.get('CHEBI_ID', list()) - cnu = attributes.get('CNU', list()) - mirbase_id = attributes.get('MIRBASE_ID', list()) - macronutrient = attributes.get('MACRONUTRIENT', list()) - essential_fatty_acid = attributes.get('ESSENTIAL_FATTY_ACID', list()) - unit = attributes.get('UNIT', list()) - pdq_open_trial_search_id = attributes.get('PDQ_OPEN_TRIAL_SEARCH_ID', list()) - term_browser_value_set_description = attributes.get('TERM_BROWSER_VALUE_SET_DESCRIPTION', list()) - entrezgene_id = attributes.get('ENTREZGENE_ID', list()) - infoods = attributes.get('INFOODS', list()) - pubmedid_primary_reference = attributes.get('PUBMEDID_PRIMARY_REFERENCE', list()) - biocarta_id = attributes.get('BIOCARTA_ID', list()) - extensible_list = attributes.get('EXTENSIBLE_LIST', list()) - use_for = 
attributes.get('USE_FOR', list()) - neoplastic_status = attributes.get('NEOPLASTIC_STATUS', list()) - nsc_number = attributes.get('NSC_NUMBER', list()) - omim_number = attributes.get('OMIM_NUMBER', list()) - lt = attributes.get('LT', list()) - kegg_id = attributes.get('KEGG_ID', list()) - gene_encodes_product = attributes.get('GENE_ENCODES_PRODUCT', list()) - pdq_closed_trial_search_id = attributes.get('PDQ_CLOSED_TRIAL_SEARCH_ID', list()) - design_note = attributes.get('DESIGN_NOTE', list()) - nutrient = attributes.get('NUTRIENT', list()) - fda_unii_code = attributes.get('FDA_UNII_CODE', list()) - us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) - chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_nddf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['MTH_RXN_CDC', 'CDC', 'CDD', 'CDA', 'IN', 'DF'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndc = attributes.get('NDC', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - -def process_omim_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'PHENO', 'PHENO_ET', 'PTAV', 'PTCS', 'ETAL', 'ET', 'HT', 'ACR'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, 
accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - genesymbol = attributes.get('GENESYMBOL', list()) - mimtypevalue = attributes.get('MIMTYPEVALUE', list()) - moved_from = attributes.get('MOVED_FROM', list()) - sos = attributes.get('SOS', list()) - genelocus = attributes.get('GENELOCUS', list()) - mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) - mimtype = attributes.get('MIMTYPE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_pdq_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'PSC', 'SY', 'ET', 'CU', 'LV', 'ACR', 'AB', 'BN', 'FBD', 'CCN', 'CHN', 'OP', 'IS'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - lt = attributes.get('LT', list()) - cas_registry = attributes.get('CAS_REGISTRY', list()) - date_first_published = attributes.get('DATE_FIRST_PUBLISHED', list()) - date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) - ind_code = attributes.get('IND_CODE', list()) - pid = attributes.get('PID', list()) - nsc_code = attributes.get('NSC_CODE', list()) - pxc = attributes.get('PXC', list()) - menu_parent = attributes.get('MENU_PARENT', list()) - nci_id = attributes.get('NCI_ID', list()) - orig_sty = attributes.get('ORIG_STY', list()) - menu_type = attributes.get('MENU_TYPE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_psy_item(node_id, info, nodes_output, 
edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'HT', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - hn = attributes.get('HN', list()) - pyr = attributes.get('PYR', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_rxnorm_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['SCD', 'SBD', 'SCDG', 'SBDG', 'BPCK', 'GPCK', 'IN', 'PSN', 'MIN', 'SCDF', 'SBDF', 'SCDC', 'DFG', 'DF', 'SBDC', 'BN', 'PIN', 'TMSY', 'SY', 'ET'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndc = attributes.get('NDC', list()) - rxn_obsoleted = attributes.get('RXN_OBSOLETED', list()) - rxn_available_strength = attributes.get('RXN_AVAILABLE_STRENGTH', list()) - rxn_human_drug = attributes.get('RXN_HUMAN_DRUG', list()) - rxn_quantity = attributes.get('RXN_QUANTITY', list()) - rxterm_form = attributes.get('RXTERM_FORM', list()) - rxn_in_expressed_flag = attributes.get('RXN_IN_EXPRESSED_FLAG', list()) - rxaui = attributes.get('RXAUI', list()) - rxn_bn_cardinality = attributes.get('RXN_BN_CARDINALITY', list()) - rxn_activated = attributes.get('RXN_ACTIVATED', list()) - rxn_boss_strength_denom_unit = attributes.get('RXN_BOSS_STRENGTH_DENOM_UNIT', list()) - ambiguity_flag = 
attributes.get('AMBIGUITY_FLAG', list()) - rxn_strength = attributes.get('RXN_STRENGTH', list()) - rxcui = attributes.get('RXCUI', list()) - rxn_ai = attributes.get('RXN_AI', list()) - rxn_boss_from = attributes.get('RXN_BOSS_FROM', list()) - rxn_boss_strength_num_unit = attributes.get('RXN_BOSS_STRENGTH_NUM_UNIT', list()) - rxn_vet_drug = attributes.get('RXN_VET_DRUG', list()) - orig_code = attributes.get('ORIG_CODE', list()) - rxn_am = attributes.get('RXN_AM', list()) - rxn_boss_strength_denom_value = attributes.get('RXN_BOSS_STRENGTH_DENOM_VALUE', list()) - rxn_boss_strength_num_value = attributes.get('RXN_BOSS_STRENGTH_NUM_VALUE', list()) - rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) - orig_source = attributes.get('ORIG_SOURCE', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) - - -def process_vandf_item(node_id, info, nodes_output, edges_output, umls_code, curie_prefix, provided_by): - accession_heirarchy = ['PT', 'CD', 'IN', 'AB', 'MTH_RXN_CD'] # https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html - node_curie, iri, name, category, synonyms, cuis, tuis = get_basic_info(curie_prefix, node_id, info, accession_heirarchy) - - # Currently not used, but extracting them in case we want them in the future - attributes = info.get(INFO_KEY, dict()) - ndf_transmit_to_cmop = attributes.get('NDF_TRANSMIT_TO_CMOP', list()) - sngl_or_mult_src_prd = attributes.get('SNGL_OR_MULT_SRC_PRD', list()) - dcsa = attributes.get('DCSA', list()) - exclude_di_check = attributes.get('EXCLUDE_DI_CHECK', list()) - nfi = attributes.get('NFI', list()) - va_class_name = attributes.get('VA_CLASS_NAME', list()) - vmo = attributes.get('VMO', list()) - drug_class_type = attributes.get('DRUG_CLASS_TYPE', list()) - nf_name = attributes.get('NF_NAME', list()) - ndc = attributes.get('NDC', list()) - vac = 
attributes.get('VAC', list()) - va_generic_name = attributes.get('VA_GENERIC_NAME', list()) - parent_class = attributes.get('PARENT_CLASS', list()) - va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) - ddf = attributes.get('DDF', list()) - - make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, create_description("", tuis), nodes_output) +class UMLS_Processor(object): + def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_name_heirarchy): + self.nodes_output = nodes_output + self.edges_output = edges_output + self.TUI_MAPPINGS = tui_mappings + self.IRI_MAPPINGS = iri_mappings + self.full_name_heirarchy = full_name_heirarchy + self.SOURCES = {'ATC': [self.process_atc_item, kg2_util.CURIE_PREFIX_ATC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ATC')], + 'CHV': [self.process_chv_item, kg2_util.CURIE_PREFIX_CHV, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'CHV')], + 'DRUGBANK': [self.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'DRUGBANK')], + 'FMA': [self.process_fma_item, kg2_util.CURIE_PREFIX_FMA, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'FMA')], + 'GO': [self.process_go_item, kg2_util.CURIE_PREFIX_GO, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'GO')], + 'HCPCS': [self.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HCPCS')], + 'HGNC': [self.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HGNC')], + 'HL7V3.0': [self.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HL7')], + 'HPO': [self.process_hpo_item, kg2_util.CURIE_PREFIX_HP, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HPO')], + 'ICD10PCS': [self.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD10PCS')], + 'ICD9CM': 
[self.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD9CM')], + 'MED-RT': [self.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MED-RT')], + 'MEDLINEPLUS': [self.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MEDLINEPLUS')], + 'MSH': [self.process_msh_item, kg2_util.CURIE_PREFIX_MESH, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MSH')], + 'MTH': [self.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MTH')], + 'NCBI': [self.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCBITAXON')], + 'NCI': [self.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI')], + 'NDDF': [self.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI')], + 'OMIM': [self.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'OMIM')], + 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ')], + 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY')], + 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM')], + 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF')]} + self.create_umls_accession_heirarchy() + self.create_accession_sources_heirarchy() + + self.CUIS_KEY = 'cuis' + self.INFO_KEY = 'attributes' + self.NAMES_KEY = 'names' + self.TUIS_KEY = 'tuis' + + + def process_node(self, source, node_id, data): + if source in self.SOURCES: + self.SOURCES[source][0](node_id, data, source) + + + def 
create_umls_accession_heirarchy(self): + self.UMLS_ACCESSION_HEIRARCHY = list() + for [source, key] in self.full_name_heirarchy: + if source in self.SOURCES: + self.UMLS_ACCESSION_HEIRARCHY.append((source, key)) + + def create_accession_sources_heirarchy(self): + self.ACCESSION_SOURCES_HEIRARCHY = dict() + for (source, key) in self.UMLS_ACCESSION_HEIRARCHY: + if source not in self.ACCESSION_SOURCES_HEIRARCHY: + self.ACCESSION_SOURCES_HEIRARCHY[source] = list() + self.ACCESSION_SOURCES_HEIRARCHY[source].append(key) + + def make_umls_node(self, node_curie, iri, name, category, update_date, provided_by, synonyms, description): + node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) + node['synonym'] = synonyms + node['description'] = description + + self.nodes_output.write(node) + + def make_node_id(self, curie_prefix, node_id): + return curie_prefix + ':' + node_id + + def get_name_synonyms(self, names_dict, source): + names = list() + if source == 'UMLS': + for (key_source, key) in self.UMLS_ACCESSION_HEIRARCHY: + names += [name for name in names_dict.get(key_source, dict()).get(key, dict()).get('Y', list())] + names += [name for name in names_dict.get(key_source, dict()).get(key, dict()).get('N', list())] + else: + for key in self.ACCESSION_SOURCES_HEIRARCHY[source]: + names += [name for name in names_dict.get(key, dict()).get('Y', list())] + names += [name for name in names_dict.get(key, dict()).get('N', list())] + + assert len(names) > 0 + if len(names) == 1: + return names[0], list() + return names[0], names[1:] + + def get_basic_info(self, source, node_id, info): + curie_prefix = self.SOURCES[source][1] + provided_by = self.SOURCES[source][2] + cuis = info.get(self.CUIS_KEY, list()) + tuis = info.get(self.TUIS_KEY, list()) + if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: + if len(cuis) != 1: + return None, None, None, None, None, None, None, None + node_id = cuis[0] + node_curie = self.make_node_id(curie_prefix, node_id) + iri = 
self.IRI_MAPPINGS[curie_prefix] + node_id + category = self.TUI_MAPPINGS[str(tuple(tuis))] + + names = info.get(self.NAMES_KEY, dict()) + name, synonyms = self.get_name_synonyms(names, source) + + return node_curie, iri, name, category, provided_by, synonyms, cuis, tuis + + def create_description(self, tuis, comment=""): + description = comment + for tui in tuis: + description += "; UMLS Semantic Type: STY:" + tui + description = description.strip("; ") + return description + + + def process_atc_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + atc_level = info.get(self.INFO_KEY, dict()).get('ATC_LEVEL', list())[0] + is_drug_class = info.get(self.INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_chv_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + combo_score = info.get(self.INFO_KEY, dict()).get('COMBO_SCORE', list()) + combo_score_no_top_words = info.get(self.INFO_KEY, dict()).get('COMBO_SCORE_NO_TOP_WORDS', list()) + context_score = info.get(self.INFO_KEY, dict()).get('CONTEXT_SCORE', list()) + cui_score = info.get(self.INFO_KEY, dict()).get('CUI_SCORE', list()) + disparaged = info.get(self.INFO_KEY, dict()).get('DISPARAGED', list()) + frequency = info.get(self.INFO_KEY, dict()).get('FREQUENCY', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_drugbank_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis 
= self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + fda_codes = info.get(self.INFO_KEY, dict()).get('FDA_UNII_CODE', list()) + secondary_accession_keys = info.get(self.INFO_KEY, dict()).get('SID', list()) + + # TODO: figure out update date + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_fma_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + authority = info.get(self.INFO_KEY, dict()).get('AUTHORITY', list()) + date_last_modified = info.get(self.INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_go_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('GO:', ''), info) + + # GO-specific information + attributes = info.get(self.INFO_KEY, dict()) + go_namespace = attributes.get('GO_NAMESPACE', list()) + assert len(go_namespace) == 1 + go_namespace = go_namespace[0] + namespace_category_map = {'molecular_function': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ACTIVITY, + 'cellular_component': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT, + 'biological_process': kg2_util.BIOLINK_CATEGORY_BIOLOGICAL_PROCESS} + category = namespace_category_map.get(go_namespace, category) + go_comment = attributes.get('GO_COMMENT', str()) + if len(go_comment) > 0: + go_comment = go_comment[0] + go_comment = "// COMMENTS: " + go_comment + + # Currently not used, but extracting them in case we want them in the future + date_created = attributes.get('DATE_CREATED', list()) + go_subset = attributes.get('GO_SUBSET', 
list()) + gxr = attributes.get('GXR', list()) + ref = attributes.get('REF', list()) + sid = attributes.get('SID', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, go_comment)) + + + def process_hcpcs_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(self.INFO_KEY, dict()) + had = attributes.get('HAD', list()) # HCPCS Action Effective Date - effective date of action to a procedure or modifier code. + hcc = attributes.get('HCC', list()) # HCPCS Coverage Code - code denoting Medicare coverage status. There are two subelements separated by "=". + hts = attributes.get('HTS', list()) # HCPCS Type of Service Code - carrier assigned HCFA Type of Service which describes the particular kind(s) of service represented by the procedure code. + hcd = attributes.get('HCD', list()) # HCPCS Code Added Date - year the HCPCS code was added to the HCFA Common Procedure Coding System. + hpn = attributes.get('HPN', list()) # HCPCS processing note number identifying the processing note contained in Appendix A of the HCPCS Manual. + haq = attributes.get('HAQ', list()) # HCPCS Anesthesia Base Unit Quantity - base unit represents the level of intensity for anesthesia procedure services that reflects all activities except time. + hlc = attributes.get('HLC', list()) # HCPCS Lab Certification Code - code used to classify laboratory procedures according to the specialty certification categories listed by CMS(formerly HCFA). + hsn = attributes.get('HSN', list()) # HCPCS Statute Number identifying statute reference for coverage or noncoverage of procedure or service. 
+ hpd = attributes.get('HPD', list()) # HCPCS ASC payment group effective date - date the procedure is assigned to the ASC payment group. + hpg = attributes.get('HPG', list()) # HCPCS ASC payment group code which represents the dollar amount of the facility charge payable by Medicare for the procedure. + hmg = attributes.get('HMR', list()) # HCPCS Medicare Carriers Manual reference section number - number identifying a section of the Medicare Carriers Manual. + hir = attributes.get('HIR', list()) # HCPCS Coverage Issues Manual Reference Section Number - number identifying the Reference Section of the Coverage Issues Manual. + hxr = attributes.get('HXR', list()) # HCPCS Cross reference code - an explicit reference crosswalking a deleted code or a code that is not valid for Medicare to a valid current code (or range of codes). + hmp = attributes.get('HMP', list()) # HCPCS Multiple Pricing Indicator Code - code used to identify instances where a procedure could be priced. + hpi = attributes.get('HPI', list()) # HCPCS Pricing Indicator Code - used to identify the appropriate methodology for developing unique pricing amounts under Part B. + hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. + hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. 
+ + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_hgnc_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HGNC:', ''), info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + mgd_id = attributes.get('MGD_ID', list()) + vega_id = attributes.get('VEGA_ID', list()) + genecc = attributes.get('GENCC', list()) + swp = attributes.get('SWP', list()) + mane_select = attributes.get('MANE_SELECT', list()) + local_specific_db_xr = attributes.get('LOCUS_SPECIFIC_DB_XR', list()) + locus_type = attributes.get('LOCUS_TYPE', list()) + agr = attributes.get('AGR', list()) + cytogenetic_location = attributes.get('CYTOGENETIC_LOCATION', list()) + date_created = attributes.get('DATE_CREATED', list()) + ensemblgene_id = attributes.get('ENSEMBLGENE_ID', list()) + db_xr_id = attributes.get('DB_XR_ID', list()) + locus_group = attributes.get('LOCUS_GROUP', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + date_name_changed = attributes.get('DATE_NAME_CHANGED', list()) + pmid = attributes.get('PMID', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + mapped_ucsc_id = attributes.get('MAPPED_UCSC_ID', list()) + refseq_id = attributes.get('REFSEQ_ID', list()) + ena = attributes.get('ENA', list()) + rgd_id = attributes.get('RGD_ID', list()) + date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) + omim_id = attributes.get('OMIM_ID', list()) + gene_fam_id = attributes.get('GENE_FAM_ID', list()) + gene_symbol = attributes.get('GENESYMBOL', list()) + ez = attributes.get('EZ', list()) + ccds_id = attributes.get('CCDS_ID', list()) + lncipedia = attributes.get('LNCIPEDIA', list()) + gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) + + self.make_umls_node(node_curie, 
iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_hl7_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html + attributes = info.get(self.INFO_KEY, dict()) + hl7at = attributes.get('HL7AT', list()) + hl7ii = attributes.get('HL7II', list()) + hl7im = attributes.get('HL7IM', list()) + hl7lt = attributes.get('HL7LT', list()) + hl7un = attributes.get('HL7UN', list()) + hl7oa = attributes.get('HL7OA', list()) + hl7scs = attributes.get('HL7SCS', list()) + hl7cc = attributes.get('HL7CC', list()) + hl7na = attributes.get('HL7NA', list()) + hl7in = attributes.get('HL7IN', list()) + hl7ap = attributes.get('HL7AP', list()) + hl7mi = attributes.get('HL7MI', list()) + hl7hi = attributes.get('HL7HI', list()) + hl7ir = attributes.get('HL7IR', list()) + hl7ai = attributes.get('HL7AI', list()) + hl7ha = attributes.get('HL7HA', list()) + hl7rf = attributes.get('HL7RF', list()) + hl7rd = attributes.get('HL7RD', list()) + hl7vd = attributes.get('HL7VD', list()) + hl7dc = attributes.get('HL7DC', list()) + hl7rk = attributes.get('HL7RK', list()) + hl7is = attributes.get('HL7IS', list()) + hl7sy = attributes.get('HL7SY', list()) + hl7cd = attributes.get('HL7CD', list()) + hl7sl = attributes.get('HL7SL', list()) + hl7pl = attributes.get('HL7PL', list()) + hl7vc = attributes.get('HL7VC', list()) + hl7ty = attributes.get('HL7TY', list()) + hl7rg = attributes.get('HL7RG', list()) + hl7csc = attributes.get('HL7CSC', list()) + hl7od = attributes.get('HL7OD', list()) + hl7id = attributes.get('HL7ID', list()) + hl7tr = attributes.get('HL7TR', list()) + hl7di = attributes.get('HL7DI', list()) + hl7cs = 
attributes.get('HL7CS', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_hpo_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HP:', ''), info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + sid = attributes.get('SID', list()) + hpo_comment = attributes.get('HPO_COMMENT', list()) + date_created = attributes.get('DATE_CREATED', list()) + syn_qualifier = attributes.get('SYN_QUALIFIER', list()) + ref = attributes.get('REF', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_icd10pcs_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + added_meaning = attributes.get('ADDED_MEANING', list()) + order_no = attributes.get('ORDER_NO', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_icd9cm_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + icc = attributes.get('ICC', list()) + ice = attributes.get('ICE', list()) + icf = attributes.get('ICF', list()) + sos = attributes.get('SOS', list()) + icn = attributes.get('ICN', list()) + ica = attributes.get('ICA', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, 
self.create_description(tuis)) + + def process_medrt_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + term_status = attributes.get('TERM_STATUS', list()) + concept_type = attributes.get('CONCEPT_TYPE', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_medlineplus_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + sos = attributes.get('SOS', list()) + date_created = attributes.get('DATE_CREATED', list()) + mp_group_url = attributes.get('MP_GROUP_URL', list()) + mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) + mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_msh_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + mmr = attributes.get('MMR', list()) + fx = attributes.get('FX', list()) + lt = attributes.get('LT', list()) + dc = attributes.get('DC', list()) + pa = attributes.get('PA', list()) + rr = attributes.get('RR', list()) + hm = attributes.get('HM', list()) + pi = attributes.get('PI', list()) + ec = attributes.get('EC', list()) 
+ hn = attributes.get('HN', list()) + termui = attributes.get('TERMUI', list()) + th = attributes.get('TH', list()) + sos = attributes.get('SOS', list()) + ii = attributes.get('II', list()) + rn = attributes.get('RN', list()) + an = attributes.get('AN', list()) + cx = attributes.get('CX', list()) + dq = attributes.get('DQ', list()) + dx = attributes.get('DX', list()) + pm = attributes.get('PM', list()) + aql = attributes.get('AQL', list()) + sc = attributes.get('SC', list()) + fr = attributes.get('FR', list()) + mda = attributes.get('MDA', list()) + src = attributes.get('SRC', list()) + ol = attributes.get('OL', list()) + mn = attributes.get('MN', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_mth_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + if node_curie == None: + return + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + mth_mapsetcomplexity = attributes.get('MTH_MAPSETCOMPLEXITY', list()) + fromvsab = attributes.get('FROMVSAB', list()) + mapsetrsab = attributes.get('MAPSETRSAB', list()) + mapsetversion = attributes.get('MAPSETVERSION', list()) + mapsetvsab = attributes.get('MAPSETVSAB', list()) + tovsab = attributes.get('TOVSAB', list()) + mth_mapfromexhaustive = attributes.get('MTH_MAPFROMEXHAUSTIVE', list()) + torsab = attributes.get('TORSAB', list()) + mapsetsid = attributes.get('MAPSETSID', list()) + mapsetgrammar = attributes.get('MAPSETGRAMMAR', list()) + mapsettype = attributes.get('MAPSETTYPE', list()) + mth_maptoexhaustive = attributes.get('MTH_MAPTOEXHAUSTIVE', list()) + fromrsab = attributes.get('FROMRSAB', list()) + mth_mapfromcomplexity = attributes.get('MTH_MAPFROMCOMPLEXITY', list()) + lt = attributes.get('LT', list()) + mth_maptocomplexity = 
attributes.get('MTH_MAPTOCOMPLEXITY', list()) + sos = attributes.get('SOS', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_ncbi_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + div = attributes.get('DIV', list()) + authority_name = attributes.get('AUTHORITY_NAME', list()) + rank = attributes.get('RANK', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_nci_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + clinvar_variation_id = attributes.get('CLINVAR_VARIATION_ID', list()) + micronutrient = attributes.get('MICRONUTRIENT', list()) + genbank_accession_number = attributes.get('GENBANK_ACCESSION_NUMBER', list()) + fda_table = attributes.get('FDA_TABLE', list()) + usda_id = attributes.get('USDA_ID', list()) + icd_o_3_code = attributes.get('ICD-O-3_CODE', list()) + tolerable_level = attributes.get('TOLERABLE_LEVEL', list()) + ncbi_taxon_id = attributes.get('NCBI_TAXON_ID', list()) + mgi_accession_id = attributes.get('MGI_ACCESSION_ID', list()) + homologous_gene = attributes.get('HOMOLOGOUS_GENE', list()) + pid_id = attributes.get('PID_ID', list()) + swiss_prot = attributes.get('SWISS_PROT', list()) + essential_amino_acid = attributes.get('ESSENTIAL_AMINO_ACID', list()) + publish_value_set = attributes.get('PUBLISH_VALUE_SET', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + value_set_pair = 
attributes.get('VALUE_SET_PAIR', list()) + accepted_therapeutic_use_for = attributes.get('ACCEPTED_THERAPEUTIC_USE_FOR', list()) + hgnc_id = attributes.get('HGNC_ID', list()) + nci_drug_dictionary_id = attributes.get('NCI_DRUG_DICTIONARY_ID', list()) + chebi_id = attributes.get('CHEBI_ID', list()) + cnu = attributes.get('CNU', list()) + mirbase_id = attributes.get('MIRBASE_ID', list()) + macronutrient = attributes.get('MACRONUTRIENT', list()) + essential_fatty_acid = attributes.get('ESSENTIAL_FATTY_ACID', list()) + unit = attributes.get('UNIT', list()) + pdq_open_trial_search_id = attributes.get('PDQ_OPEN_TRIAL_SEARCH_ID', list()) + term_browser_value_set_description = attributes.get('TERM_BROWSER_VALUE_SET_DESCRIPTION', list()) + entrezgene_id = attributes.get('ENTREZGENE_ID', list()) + infoods = attributes.get('INFOODS', list()) + pubmedid_primary_reference = attributes.get('PUBMEDID_PRIMARY_REFERENCE', list()) + biocarta_id = attributes.get('BIOCARTA_ID', list()) + extensible_list = attributes.get('EXTENSIBLE_LIST', list()) + use_for = attributes.get('USE_FOR', list()) + neoplastic_status = attributes.get('NEOPLASTIC_STATUS', list()) + nsc_number = attributes.get('NSC_NUMBER', list()) + omim_number = attributes.get('OMIM_NUMBER', list()) + lt = attributes.get('LT', list()) + kegg_id = attributes.get('KEGG_ID', list()) + gene_encodes_product = attributes.get('GENE_ENCODES_PRODUCT', list()) + pdq_closed_trial_search_id = attributes.get('PDQ_CLOSED_TRIAL_SEARCH_ID', list()) + design_note = attributes.get('DESIGN_NOTE', list()) + nutrient = attributes.get('NUTRIENT', list()) + fda_unii_code = attributes.get('FDA_UNII_CODE', list()) + us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) + chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + def process_nddf_item(self, node_id, info, umls_code): + node_curie, iri, name, 
category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + def process_omim_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + genesymbol = attributes.get('GENESYMBOL', list()) + mimtypevalue = attributes.get('MIMTYPEVALUE', list()) + moved_from = attributes.get('MOVED_FROM', list()) + sos = attributes.get('SOS', list()) + genelocus = attributes.get('GENELOCUS', list()) + mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) + mimtype = attributes.get('MIMTYPE', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_pdq_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + lt = attributes.get('LT', list()) + cas_registry = attributes.get('CAS_REGISTRY', list()) + date_first_published = attributes.get('DATE_FIRST_PUBLISHED', list()) + date_last_modified = attributes.get('DATE_LAST_MODIFIED', list()) + ind_code = attributes.get('IND_CODE', list()) + pid = attributes.get('PID', list()) + nsc_code = attributes.get('NSC_CODE', list()) + pxc = attributes.get('PXC', list()) + menu_parent = attributes.get('MENU_PARENT', list()) + nci_id = attributes.get('NCI_ID', list()) + orig_sty = 
attributes.get('ORIG_STY', list()) + menu_type = attributes.get('MENU_TYPE', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_psy_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + hn = attributes.get('HN', list()) + pyr = attributes.get('PYR', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_rxnorm_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + ndc = attributes.get('NDC', list()) + rxn_obsoleted = attributes.get('RXN_OBSOLETED', list()) + rxn_available_strength = attributes.get('RXN_AVAILABLE_STRENGTH', list()) + rxn_human_drug = attributes.get('RXN_HUMAN_DRUG', list()) + rxn_quantity = attributes.get('RXN_QUANTITY', list()) + rxterm_form = attributes.get('RXTERM_FORM', list()) + rxn_in_expressed_flag = attributes.get('RXN_IN_EXPRESSED_FLAG', list()) + rxaui = attributes.get('RXAUI', list()) + rxn_bn_cardinality = attributes.get('RXN_BN_CARDINALITY', list()) + rxn_activated = attributes.get('RXN_ACTIVATED', list()) + rxn_boss_strength_denom_unit = attributes.get('RXN_BOSS_STRENGTH_DENOM_UNIT', list()) + ambiguity_flag = attributes.get('AMBIGUITY_FLAG', list()) + rxn_strength = attributes.get('RXN_STRENGTH', list()) + rxcui = attributes.get('RXCUI', list()) + rxn_ai = attributes.get('RXN_AI', list()) + rxn_boss_from = attributes.get('RXN_BOSS_FROM', list()) + rxn_boss_strength_num_unit = 
attributes.get('RXN_BOSS_STRENGTH_NUM_UNIT', list()) + rxn_vet_drug = attributes.get('RXN_VET_DRUG', list()) + orig_code = attributes.get('ORIG_CODE', list()) + rxn_am = attributes.get('RXN_AM', list()) + rxn_boss_strength_denom_value = attributes.get('RXN_BOSS_STRENGTH_DENOM_VALUE', list()) + rxn_boss_strength_num_value = attributes.get('RXN_BOSS_STRENGTH_NUM_VALUE', list()) + rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) + orig_source = attributes.get('ORIG_SOURCE', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + + def process_vandf_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + + # Currently not used, but extracting them in case we want them in the future + attributes = info.get(self.INFO_KEY, dict()) + ndf_transmit_to_cmop = attributes.get('NDF_TRANSMIT_TO_CMOP', list()) + sngl_or_mult_src_prd = attributes.get('SNGL_OR_MULT_SRC_PRD', list()) + dcsa = attributes.get('DCSA', list()) + exclude_di_check = attributes.get('EXCLUDE_DI_CHECK', list()) + nfi = attributes.get('NFI', list()) + va_class_name = attributes.get('VA_CLASS_NAME', list()) + vmo = attributes.get('VMO', list()) + drug_class_type = attributes.get('DRUG_CLASS_TYPE', list()) + nf_name = attributes.get('NF_NAME', list()) + ndc = attributes.get('NDC', list()) + vac = attributes.get('VAC', list()) + va_generic_name = attributes.get('VA_GENERIC_NAME', list()) + parent_class = attributes.get('PARENT_CLASS', list()) + va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) + ddf = attributes.get('DDF', list()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) From 247d38cea2c50d209c0c41ab1a27f9995f602596 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 22 Aug 2023 17:04:29 -0700 Subject: [PATCH 064/117] 
#316 UMLS integrated, ran to completion in a little over 23 minutes --- umls_util.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/umls_util.py b/umls_util.py index 59705e5c..b02bcfe4 100644 --- a/umls_util.py +++ b/umls_util.py @@ -45,7 +45,8 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ')], 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY')], 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM')], - 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF')]} + 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF')], + 'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls')]} self.create_umls_accession_heirarchy() self.create_accession_sources_heirarchy() @@ -53,6 +54,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.INFO_KEY = 'attributes' self.NAMES_KEY = 'names' self.TUIS_KEY = 'tuis' + self.DEFINITIONS_KEY = 'definitions' def process_node(self, source, node_id, data): @@ -94,7 +96,8 @@ def get_name_synonyms(self, names_dict, source): names += [name for name in names_dict.get(key, dict()).get('Y', list())] names += [name for name in names_dict.get(key, dict()).get('N', list())] - assert len(names) > 0 + if len(names) == 0: + return None, None if len(names) == 1: return names[0], list() return names[0], names[1:] @@ -103,8 +106,8 @@ def get_basic_info(self, source, node_id, info): curie_prefix = self.SOURCES[source][1] provided_by = self.SOURCES[source][2] cuis = 
info.get(self.CUIS_KEY, list()) - tuis = info.get(self.TUIS_KEY, list()) - if curie_prefix == kg2_util.CURIE_PREFIX_UMLS: + tuis = sorted(info.get(self.TUIS_KEY, list())) + if curie_prefix == kg2_util.CURIE_PREFIX_UMLS and source != 'UMLS': if len(cuis) != 1: return None, None, None, None, None, None, None, None node_id = cuis[0] @@ -114,6 +117,8 @@ def get_basic_info(self, source, node_id, info): names = info.get(self.NAMES_KEY, dict()) name, synonyms = self.get_name_synonyms(names, source) + if name == None: + return None, None, None, None, None, None, None, None return node_curie, iri, name, category, provided_by, synonyms, cuis, tuis @@ -616,3 +621,12 @@ def process_vandf_item(self, node_id, info, umls_code): ddf = attributes.get('DDF', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + + def process_umls_item(self, node_id, info, umls_code): + node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + if node_curie == None: + return + + description = info.get(self.DEFINITIONS_KEY, str()) + + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) \ No newline at end of file From c579e8573daabf20fda232315c9d6f70dedd83f8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 09:42:24 -0700 Subject: [PATCH 065/117] #316 switch it to group by source then code to make output order for by source --- umls_mysql_to_list_jsonl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index fc91b7ca..90475c0d 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -52,9 +52,9 @@ def code_sources(cursor, output): info_key = 'attributes' # See TTY meanings here: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html - 
names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.CODE, con.SAB" - extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.CODE, sat.SAB" - tuis_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT sty.TUI) FROM MRCONSO con LEFT JOIN MRSTY sty ON con.CUI = sty.CUI GROUP BY con.CODE, con.SAB" + names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.SAB, con.CODE" + extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.SAB, sat.CODE" + tuis_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT sty.TUI) FROM MRCONSO con LEFT JOIN MRSTY sty ON con.CUI = sty.CUI GROUP BY con.SAB, con.CODE" cursor.execute(names_sql_statement) for result in cursor.fetchall(): From 1cd2b17b66c75f17171c7686aea2bd27d208b8ea Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 09:42:35 -0700 Subject: [PATCH 066/117] #316 remove some unneeded code --- umls_list_jsonl_to_kg_jsonl.py | 64 ---------------------------------- 1 file changed, 64 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 49569844..a431d9cd 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -22,59 +22,6 @@ TUI_MAPPINGS = dict() IRI_MAPPINGS = dict() -# ATC_PREFIX = kg2_util.CURIE_PREFIX_ATC -# CHV_PREFIX = kg2_util.CURIE_PREFIX_CHV -# DRUGBANK_PREFIX = kg2_util.CURIE_PREFIX_DRUGBANK -# FMA_PREFIX = kg2_util.CURIE_PREFIX_FMA -# GO_PREFIX = kg2_util.CURIE_PREFIX_GO -# HCPCS_PREFIX = kg2_util.CURIE_PREFIX_HCPCS -# 
HGNC_PREFIX = kg2_util.CURIE_PREFIX_HGNC -# HL7_PREFIX = kg2_util.CURIE_PREFIX_UMLS -# HPO_PREFIX = kg2_util.CURIE_PREFIX_HP -# ICD10PCS_PREFIX = kg2_util.CURIE_PREFIX_ICD10PCS -# ICD9CM_PREFIX = kg2_util.CURIE_PREFIX_ICD9 -# MEDRT_PREFIX = kg2_util.CURIE_PREFIX_UMLS -# MEDLINEPLUS_PREFIX = kg2_util.CURIE_PREFIX_UMLS -# MSH_PREFIX = kg2_util.CURIE_PREFIX_MESH -# MTH_PREFIX = kg2_util.CURIE_PREFIX_UMLS -# NCBI_PREFIX = kg2_util.CURIE_PREFIX_NCBI_TAXON -# NCI_PREFIX = kg2_util.CURIE_PREFIX_NCIT -# NDDF_PREFIX = kg2_util.CURIE_PREFIX_NDDF -# OMIM_PREFIX = kg2_util.CURIE_PREFIX_OMIM -# PDQ_PREFIX = kg2_util.CURIE_PREFIX_PDQ -# PSY_PREFIX = kg2_util.CURIE_PREFIX_PSY -# RXNORM_PREFIX = kg2_util.CURIE_PREFIX_RXNORM -# VANDF_PREFIX = kg2_util.CURIE_PREFIX_VANDF - -# UMLS_SOURCE_PREFIX = kg2_util.CURIE_PREFIX_UMLS_SOURCE - -# DESIRED_CODES = {'ATC': [umls_util.process_atc_item, kg2_util.CURIE_PREFIX_ATC, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'ATC')], -# 'CHV': [umls_util.process_chv_item, kg2_util.CURIE_PREFIX_CHV, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'CHV')], -# 'DRUGBANK': [umls_util.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'DRUGBANK')], -# 'FMA': [umls_util.process_fma_item, kg2_util.CURIE_PREFIX_FMA, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'FMA')], -# 'GO': [umls_util.process_go_item, kg2_util.CURIE_PREFIX_GO, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'GO')], -# 'HCPCS': [umls_util.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HCPCS')], -# 'HGNC': [umls_util.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HGNC')], -# 'HL7V3.0': [umls_util.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HL7')], -# 'HPO': [umls_util.process_hpo_item, kg2_util.CURIE_PREFIX_HP, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'HPO')], -# 'ICD10PCS': [umls_util.process_icd10pcs_item, 
kg2_util.CURIE_PREFIX_ICD10PCS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'ICD10PCS')], -# 'ICD9CM': [umls_util.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'ICD9CM')], -# 'MED-RT': [umls_util.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MED-RT')], -# 'MEDLINEPLUS': [umls_util.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MEDLINEPLUS')], -# 'MSH': [umls_util.process_msh_item, kg2_util.CURIE_PREFIX_MESH, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MSH')], -# 'MTH': [umls_util.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'MTH')], -# 'NCBI': [umls_util.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCBITAXON')], -# 'NCI': [umls_util.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], -# 'NDDF': [umls_util.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'NCI')], -# 'OMIM': [umls_util.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'OMIM')], -# 'PDQ': [umls_util.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'PDQ')], -# 'PSY': [umls_util.process_psy_item, kg2_util.CURIE_PREFIX_PSY, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'PSY')], -# 'RXNORM': [umls_util.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'RXNORM')], -# 'VANDF': [umls_util.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, umls_util.make_node_id(UMLS_SOURCE_PREFIX, 'VANDF')]} - -# # Mined from HTML Page Source of https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/precedence_suppressibility.html -# ACCESSION_HEIRARCHY = list() -# ACCESSION_SOURCES_HEIRARCHY = dict() def get_args(): arg_parser = 
argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') @@ -91,17 +38,6 @@ def extract_node_id(node_id_str): return node_id[0].strip(), node_id[1].strip() -def create_accession_heirarchy(full_heirarchy): - for [source, key] in full_heirarchy: - if source in DESIRED_CODES: - ACCESSION_HEIRARCHY.append((source, key)) - -def create_accession_sources_heirarchy(): - for (source, key) in ACCESSION_HEIRARCHY: - if source not in ACCESSION_SOURCES_HEIRARCHY: - ACCESSION_SOURCES_HEIRARCHY[source] = list() - ACCESSION_SOURCES_HEIRARCHY[source].append(key) - if __name__ == '__main__': print("Starting umls_list_jsonl_to_kg_jsonl.py at", kg2_util.date()) args = get_args() From 6e36f24c9041909816804ce628a67b4f019e1113 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 16:26:32 -0700 Subject: [PATCH 067/117] #316 descriptions are present for non-CUI nodes too --- umls_mysql_to_list_jsonl.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 90475c0d..e7f055c4 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -47,6 +47,7 @@ def code_sources(cursor, output): tui_key = 'tuis' cui_key = 'cuis' name_key = 'names' + definitions_key = 'definitions' # See info about these here: https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html info_key = 'attributes' @@ -55,6 +56,7 @@ def code_sources(cursor, output): names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.SAB, con.CODE" extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.SAB, sat.CODE" tuis_sql_statement = "SELECT con.CODE, con.SAB, 
GROUP_CONCAT(DISTINCT sty.TUI) FROM MRCONSO con LEFT JOIN MRSTY sty ON con.CUI = sty.CUI GROUP BY con.SAB, con.CODE" + definitions_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT def.DEF SEPARATOR ';') FROM MRCONSO con INNER JOIN MRDEF def on con.CUI=def.CUI GROUP BY con.SAB, con.CODE" cursor.execute(names_sql_statement) for result in cursor.fetchall(): @@ -106,6 +108,17 @@ def code_sources(cursor, output): print("Finished tuis_sql_statement at", kg2_util.date()) + cursor.execute(definitions_sql_statement) + for result in cursor.fetchall(): + (node_id, node_source, definition) = result + key = (node_source, node_id) + if key not in code_source_info: + # This occurs if a node doesn't have a name. + continue + code_source_info[key][definitions_key] = definition + + print("Finished definitions_sql_statement at", kg2_util.date()) + record_num = 0 for key, val in code_source_info.items(): record_num += 1 @@ -129,7 +142,7 @@ def cui_sources(cursor, output, sources): names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(TTY, '|', SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY CUI" relations_sql_statement = "SELECT DISTINCT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where - definitions_sql_statement = "SELECT CUI, DEF FROM MRDEF WHERE SAB IN " + sources_where + definitions_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT DEF SEPARATOR ';') FROM MRDEF WHERE SAB IN " + sources_where + " GROUP BY CUI" cursor.execute(names_sql_statement) for result in cursor.fetchall(): From eb587858bdb4ce131d9f09669b477147f910dde6 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 16:26:49 -0700 Subject: [PATCH 068/117] #316 non-CUI descriptions and xref edges --- umls_util.py | 145 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 92 insertions(+), 53 deletions(-) diff --git 
a/umls_util.py b/umls_util.py index b02bcfe4..28546e9b 100644 --- a/umls_util.py +++ b/umls_util.py @@ -55,9 +55,13 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.NAMES_KEY = 'names' self.TUIS_KEY = 'tuis' self.DEFINITIONS_KEY = 'definitions' + self.last_source = '' def process_node(self, source, node_id, data): + if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES: + print("Finished processing", self.last_source, "at", kg2_util.date()) + self.last_source = source if source in self.SOURCES: self.SOURCES[source][0](node_id, data, source) @@ -102,14 +106,25 @@ def get_name_synonyms(self, names_dict, source): return names[0], list() return names[0], names[1:] + def create_xref_edges(subject_id, cuis, provided_by): + relation_curie = 'UMLS:xref' + relation_label = 'xref' + + for cui in cuis: + object_id = make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) + # TODO: resolve update_date + self.edges_output.write(make_edge(subject_id, object_id, relation_curie, relation_label, primary_knowledge_source, "2023")) + + def get_basic_info(self, source, node_id, info): curie_prefix = self.SOURCES[source][1] provided_by = self.SOURCES[source][2] cuis = info.get(self.CUIS_KEY, list()) tuis = sorted(info.get(self.TUIS_KEY, list())) + description = info.get(self.DEFINITIONS_KEY, str()) if curie_prefix == kg2_util.CURIE_PREFIX_UMLS and source != 'UMLS': if len(cuis) != 1: - return None, None, None, None, None, None, None, None + return None, None, None, None, None, None, None, None, None node_id = cuis[0] node_curie = self.make_node_id(curie_prefix, node_id) iri = self.IRI_MAPPINGS[curie_prefix] + node_id @@ -118,9 +133,9 @@ def get_basic_info(self, source, node_id, info): names = info.get(self.NAMES_KEY, dict()) name, synonyms = self.get_name_synonyms(names, source) if name == None: - return None, None, None, None, None, None, None, None + return None, None, None, None, None, None, None, None, None - 
return node_curie, iri, name, category, provided_by, synonyms, cuis, tuis + return node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis def create_description(self, tuis, comment=""): description = comment @@ -131,17 +146,18 @@ def create_description(self, tuis, comment=""): def process_atc_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future atc_level = info.get(self.INFO_KEY, dict()).get('ATC_LEVEL', list())[0] is_drug_class = info.get(self.INFO_KEY, dict()).get('IS_DRUG_CLASS', list()) == ["Y"] - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_chv_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future combo_score = info.get(self.INFO_KEY, dict()).get('COMBO_SCORE', list()) @@ -151,32 +167,35 @@ def process_chv_item(self, node_id, info, umls_code): disparaged = info.get(self.INFO_KEY, dict()).get('DISPARAGED', list()) frequency = info.get(self.INFO_KEY, dict()).get('FREQUENCY', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", 
provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_drugbank_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future fda_codes = info.get(self.INFO_KEY, dict()).get('FDA_UNII_CODE', list()) secondary_accession_keys = info.get(self.INFO_KEY, dict()).get('SID', list()) # TODO: figure out update date - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_fma_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future authority = info.get(self.INFO_KEY, dict()).get('AUTHORITY', list()) date_last_modified = info.get(self.INFO_KEY, dict()).get('DATE_LAST_MODIFIED', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_go_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, 
node_id.replace('GO:', ''), info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('GO:', ''), info) # GO-specific information attributes = info.get(self.INFO_KEY, dict()) @@ -200,10 +219,11 @@ def process_go_item(self, node_id, info, umls_code): sid = attributes.get('SID', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, go_comment)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_hcpcs_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future - descriptions from https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/attribute_names.html attributes = info.get(self.INFO_KEY, dict()) @@ -225,11 +245,12 @@ def process_hcpcs_item(self, node_id, info, umls_code): hac = attributes.get('HAC', list()) # HCPCS action code - code denoting the change made to a procedure or modifier code within the HCPCS system. hbt = attributes.get('HBT', list()) # HCPCS Berenson-Eggers Type of Service Code - BETOS for the procedure code based on generally agreed upon clinically meaningful groupings of procedures and services. 
- self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_hgnc_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HGNC:', ''), info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HGNC:', ''), info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -263,11 +284,12 @@ def process_hgnc_item(self, node_id, info, umls_code): lncipedia = attributes.get('LNCIPEDIA', list()) gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_hl7_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) if node_curie == None: return @@ -309,36 +331,41 @@ def process_hl7_item(self, node_id, info, umls_code): hl7di = attributes.get('HL7DI', list()) hl7cs = attributes.get('HL7CS', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + 
self.create_xref_edges(node_curie, cuis, provided_by) def process_hpo_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HP:', ''), info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HP:', ''), info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) sid = attributes.get('SID', list()) - hpo_comment = attributes.get('HPO_COMMENT', list()) + hpo_comment = attributes.get('HPO_COMMENT', str()) + if len(hpo_comment) > 0: + hpo_comment = hpo_comment[0] date_created = attributes.get('DATE_CREATED', list()) syn_qualifier = attributes.get('SYN_QUALIFIER', list()) ref = attributes.get('REF', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_icd10pcs_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) added_meaning = attributes.get('ADDED_MEANING', list()) order_no = attributes.get('ORDER_NO', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) 
def process_icd9cm_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -349,10 +376,11 @@ def process_icd9cm_item(self, node_id, info, umls_code): icn = attributes.get('ICN', list()) ica = attributes.get('ICA', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_medrt_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) if node_curie == None: return @@ -361,11 +389,12 @@ def process_medrt_item(self, node_id, info, umls_code): term_status = attributes.get('TERM_STATUS', list()) concept_type = attributes.get('CONCEPT_TYPE', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_medlineplus_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, 
node_id, info) if node_curie == None: return @@ -377,11 +406,12 @@ def process_medlineplus_item(self, node_id, info, umls_code): mp_primary_institute_url = attributes.get('MP_PRIMARY_INSTITUTE_URL', list()) mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_msh_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -413,11 +443,12 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_mth_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) if node_curie == None: return @@ -441,11 +472,12 @@ def process_mth_item(self, node_id, info, umls_code): mth_maptocomplexity = attributes.get('MTH_MAPTOCOMPLEXITY', list()) sos = attributes.get('SOS', list()) - self.make_umls_node(node_curie, 
iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_ncbi_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -453,11 +485,12 @@ def process_ncbi_item(self, node_id, info, umls_code): authority_name = attributes.get('AUTHORITY_NAME', list()) rank = attributes.get('RANK', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_nci_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -507,19 +540,21 @@ def process_nci_item(self, node_id, info, umls_code): us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, 
synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_nddf_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) ndc = attributes.get('NDC', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_omim_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -531,11 +566,12 @@ def process_omim_item(self, node_id, info, umls_code): mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) mimtype = attributes.get('MIMTYPE', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_pdq_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, 
synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -552,22 +588,24 @@ def process_pdq_item(self, node_id, info, umls_code): orig_sty = attributes.get('ORIG_STY', list()) menu_type = attributes.get('MENU_TYPE', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_psy_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) hn = attributes.get('HN', list()) pyr = attributes.get('PYR', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_rxnorm_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -596,11 +634,12 @@ def process_rxnorm_item(self, node_id, info, umls_code): 
rxn_qualitative_distinction = attributes.get('RXN_QUALITATIVE_DISTINCTION', list()) orig_source = attributes.get('ORIG_SOURCE', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_vandf_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) @@ -620,13 +659,13 @@ def process_vandf_item(self, node_id, info, umls_code): va_dispense_unit = attributes.get('VA_DISPENSE_UNIT', list()) ddf = attributes.get('DDF', list()) - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) def process_umls_item(self, node_id, info, umls_code): - node_curie, iri, name, category, provided_by, synonyms, cuis, tuis = self.get_basic_info(umls_code, node_id, info) + node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id, info) if node_curie == None: return - description = info.get(self.DEFINITIONS_KEY, str()) - - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) \ No newline at end of file + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, 
self.create_description(tuis, description)) + self.create_xref_edges(node_curie, cuis, provided_by) \ No newline at end of file From d0067194921b625ba38bfe28c97ae492b2e6bd83 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 16:28:29 -0700 Subject: [PATCH 069/117] #316 don't need xrefs for CUI nodes --- umls_util.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/umls_util.py b/umls_util.py index 28546e9b..b603e015 100644 --- a/umls_util.py +++ b/umls_util.py @@ -332,7 +332,6 @@ def process_hl7_item(self, node_id, info, umls_code): hl7cs = attributes.get('HL7CS', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_xref_edges(node_curie, cuis, provided_by) def process_hpo_item(self, node_id, info, umls_code): @@ -390,7 +389,6 @@ def process_medrt_item(self, node_id, info, umls_code): concept_type = attributes.get('CONCEPT_TYPE', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_xref_edges(node_curie, cuis, provided_by) def process_medlineplus_item(self, node_id, info, umls_code): @@ -407,7 +405,6 @@ def process_medlineplus_item(self, node_id, info, umls_code): mp_other_language_url = attributes.get('MP_OTHER_LANGUAGE_URL', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_xref_edges(node_curie, cuis, provided_by) def process_msh_item(self, node_id, info, umls_code): @@ -473,7 +470,6 @@ def process_mth_item(self, node_id, info, umls_code): sos = attributes.get('SOS', list()) self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_xref_edges(node_curie, cuis, provided_by) def process_ncbi_item(self, node_id, info, umls_code): @@ -668,4 +664,3 @@ def process_umls_item(self, 
node_id, info, umls_code): return self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_xref_edges(node_curie, cuis, provided_by) \ No newline at end of file From ae147b3945b632c841f0ebb397a60a9c1068884c Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 23 Aug 2023 16:54:12 -0700 Subject: [PATCH 070/117] #316 only want definitions from that source --- umls_mysql_to_list_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index e7f055c4..df34602a 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -56,7 +56,7 @@ def code_sources(cursor, output): names_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT con.CUI), GROUP_CONCAT(DISTINCT CONCAT(con.TTY, '|', con.ISPREF, '|', con.STR) SEPARATOR '\t') FROM MRCONSO con GROUP BY con.SAB, con.CODE" extra_info_sql_statement = "SELECT sat.CODE, sat.SAB, GROUP_CONCAT(DISTINCT CONCAT(sat.ATN, '|', REPLACE(sat.ATV, '\t', ' ')) SEPARATOR '\t') FROM MRSAT sat GROUP BY sat.SAB, sat.CODE" tuis_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT sty.TUI) FROM MRCONSO con LEFT JOIN MRSTY sty ON con.CUI = sty.CUI GROUP BY con.SAB, con.CODE" - definitions_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT def.DEF SEPARATOR ';') FROM MRCONSO con INNER JOIN MRDEF def on con.CUI=def.CUI GROUP BY con.SAB, con.CODE" + definitions_sql_statement = "SELECT con.CODE, con.SAB, GROUP_CONCAT(DISTINCT def.DEF SEPARATOR ';') FROM MRCONSO con INNER JOIN MRDEF def on con.CUI=def.CUI WHERE con.SAB=def.SAB GROUP BY con.SAB, con.CODE" cursor.execute(names_sql_statement) for result in cursor.fetchall(): From b43dd505ce5aadb65404c0829abc815e8c742115 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 12:12:03 -0700 Subject: [PATCH 071/117] #316 we only want UMLS CUI descriptions from sources that we are using --- 
umls_mysql_to_list_jsonl.py | 7 +++++-- umls_util.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index df34602a..0776ab23 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -142,7 +142,7 @@ def cui_sources(cursor, output, sources): names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(TTY, '|', SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY CUI" relations_sql_statement = "SELECT DISTINCT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where - definitions_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT DEF SEPARATOR ';') FROM MRDEF WHERE SAB IN " + sources_where + " GROUP BY CUI" + definitions_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(SAB, '|', DEF) SEPARATOR '\t') FROM MRDEF WHERE SAB IN " + sources_where + " GROUP BY CUI" cursor.execute(names_sql_statement) for result in cursor.fetchall(): @@ -204,7 +204,10 @@ def cui_sources(cursor, output, sources): if key not in cui_source_info: # See above for explanation continue - cui_source_info[key][definitions_key] = definition + for def_piece in definition.split('\t'): + split_def_piece = def_piece.split('|') + assert len(split_def_piece) == 2, split_def_piece + cui_source_info[key][definitions_key][split_def_piece[0]] = split_def_piece[1] print("Finished definitions_sql_statement at", kg2_util.date()) diff --git a/umls_util.py b/umls_util.py index b603e015..bde5931b 100644 --- a/umls_util.py +++ b/umls_util.py @@ -121,7 +121,16 @@ def get_basic_info(self, source, node_id, info): provided_by = self.SOURCES[source][2] cuis = info.get(self.CUIS_KEY, list()) tuis = sorted(info.get(self.TUIS_KEY, list())) - description = info.get(self.DEFINITIONS_KEY, str()) + description = str() + if source == 'UMLS': + description = list() + 
description_dict = info.get(self.DEFINITIONS_KEY, dict()) + for description_key in description_dict: + if description_key in self.SOURCES: + description.append(description_dict[description_key]) + description = '; '.join(description) + else: + description = info.get(self.DEFINITIONS_KEY, str()) if curie_prefix == kg2_util.CURIE_PREFIX_UMLS and source != 'UMLS': if len(cuis) != 1: return None, None, None, None, None, None, None, None, None From 743878af1771d6952d9e4ff5334803b01953f969 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 12:30:51 -0700 Subject: [PATCH 072/117] #316 forgot to add definitions key to dictionary --- umls_mysql_to_list_jsonl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 0776ab23..b6532725 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -204,6 +204,8 @@ def cui_sources(cursor, output, sources): if key not in cui_source_info: # See above for explanation continue + if definitions_key not in cui_source_info[key]: + cui_source_info[key][definitions_key] = dict() for def_piece in definition.split('\t'): split_def_piece = def_piece.split('|') assert len(split_def_piece) == 2, split_def_piece From 70d49317e2d76483fdba9464ed64fa352c56fb2d Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 13:28:53 -0700 Subject: [PATCH 073/117] #316 can't have tabs in the definition --- umls_mysql_to_list_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index b6532725..24259de7 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -142,7 +142,7 @@ def cui_sources(cursor, output, sources): names_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(TTY, '|', SAB, '|', ISPREF, '|', STR) SEPARATOR '\t') FROM MRCONSO WHERE SAB IN " + sources_where + " GROUP BY CUI" tuis_sql_statement = "SELECT CUI, GROUP_CONCAT(TUI) FROM MRSTY GROUP BY 
CUI" relations_sql_statement = "SELECT DISTINCT CUI1, REL, RELA, DIR, CUI2, SAB FROM MRREL WHERE SAB IN " + sources_where - definitions_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(SAB, '|', DEF) SEPARATOR '\t') FROM MRDEF WHERE SAB IN " + sources_where + " GROUP BY CUI" + definitions_sql_statement = "SELECT CUI, GROUP_CONCAT(DISTINCT CONCAT(SAB, '|', REPLACE(DEF, '\t', ' ')) SEPARATOR '\t') FROM MRDEF WHERE SAB IN " + sources_where + " GROUP BY CUI" cursor.execute(names_sql_statement) for result in cursor.fetchall(): From 9c8d733e3a26625843e231ae45e9f527ab1364ad Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 15:11:21 -0700 Subject: [PATCH 074/117] #316 resolve issues with xrefs and descriptions (for MEDLINEPLUS) --- umls_util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/umls_util.py b/umls_util.py index bde5931b..b2e76e49 100644 --- a/umls_util.py +++ b/umls_util.py @@ -106,14 +106,14 @@ def get_name_synonyms(self, names_dict, source): return names[0], list() return names[0], names[1:] - def create_xref_edges(subject_id, cuis, provided_by): + def create_xref_edges(self, subject_id, cuis, provided_by): relation_curie = 'UMLS:xref' relation_label = 'xref' for cui in cuis: - object_id = make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) + object_id = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) # TODO: resolve update_date - self.edges_output.write(make_edge(subject_id, object_id, relation_curie, relation_label, primary_knowledge_source, "2023")) + self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) def get_basic_info(self, source, node_id, info): @@ -147,7 +147,7 @@ def get_basic_info(self, source, node_id, info): return node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis def create_description(self, tuis, comment=""): - description = comment + description = comment.replace('

', '').replace('

', '').replace('
  • ', '').replace('
  • ', '').replace('
      ', '').replace('
    ', '') for tui in tuis: description += "; UMLS Semantic Type: STY:" + tui description = description.strip("; ") From 6019cf1140d1a43551201abea7f7d7ff5c28f18b Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 16:07:30 -0700 Subject: [PATCH 075/117] #316 initial umls edges --- umls_util.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/umls_util.py b/umls_util.py index b2e76e49..8be10253 100644 --- a/umls_util.py +++ b/umls_util.py @@ -55,6 +55,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.NAMES_KEY = 'names' self.TUIS_KEY = 'tuis' self.DEFINITIONS_KEY = 'definitions' + self.RELATIONS_KEY = 'relations' self.last_source = '' @@ -115,6 +116,21 @@ def create_xref_edges(self, subject_id, cuis, provided_by): # TODO: resolve update_date self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) +## TODO: make relation nodes +## TODO: make TUI nodes + + def create_umls_edges(self, subject_id, relations, provided_by): + for relation_source in relations: + if relation_source in self.SOURCES: + for relation in relations[relation_source]: + relation_abbr, relation_label, relation_direction = relation.split(',') + if relation_label == 'None': + relation_label = relation_abbr + relation_curie = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, relation_label) + for cui in relations[relation_source][relation]: + object_id = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) + # TODO: resolve update_date + self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) def get_basic_info(self, source, node_id, info): curie_prefix = self.SOURCES[source][1] @@ -673,3 +689,4 @@ def process_umls_item(self, node_id, info, umls_code): return self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + 
self.create_umls_edges(node_curie, info.get(RELATIONS_KEY, dict()), provided_by) From bdd5f9133246c321c919489b6a4a51aeaadfb214 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 24 Aug 2023 16:08:06 -0700 Subject: [PATCH 076/117] #316 cleaning up umls list to jsonl --- umls_list_jsonl_to_kg_jsonl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index a431d9cd..35fd780e 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -53,9 +53,6 @@ def extract_node_id(node_id_str): input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_items = input_read_jsonlines_info[0] - name_keys = set() - attribute_keys = set() - with open('tui_combo_mappings.json') as mappings: TUI_MAPPINGS = json.load(mappings) From 6e937c9b410f9a94d5132e1bbddffba14e5ea99d Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 25 Aug 2023 10:32:02 -0700 Subject: [PATCH 077/117] #316 handling relation direction, used example UMLS:C2063866 RO,may_treat,Y with objects [UMLS:C2825616, UMLS:C3818973] to realize need to flip --- umls_util.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/umls_util.py b/umls_util.py index 8be10253..e68ef4b6 100644 --- a/umls_util.py +++ b/umls_util.py @@ -119,18 +119,22 @@ def create_xref_edges(self, subject_id, cuis, provided_by): ## TODO: make relation nodes ## TODO: make TUI nodes - def create_umls_edges(self, subject_id, relations, provided_by): + def create_umls_edges(self, subject_id, relations): for relation_source in relations: if relation_source in self.SOURCES: + provided_by = self.SOURCES[relation_source][2] for relation in relations[relation_source]: relation_abbr, relation_label, relation_direction = relation.split(',') if relation_label == 'None': relation_label = relation_abbr - relation_curie = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, relation_label) + relation_curie = self.make_node_id(relation_source, 
relation_label) for cui in relations[relation_source][relation]: object_id = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) # TODO: resolve update_date - self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) + if relation_direction == 'Y': + self.edges_output.write(kg2_util.make_edge(object_id, subject_id, relation_curie, relation_label, provided_by, "2023")) + else: + self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) def get_basic_info(self, source, node_id, info): curie_prefix = self.SOURCES[source][1] @@ -689,4 +693,4 @@ def process_umls_item(self, node_id, info, umls_code): return self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) - self.create_umls_edges(node_curie, info.get(RELATIONS_KEY, dict()), provided_by) + self.create_umls_edges(node_curie, info.get(self.RELATIONS_KEY, dict())) From 92558feeea1a24e17af71b2156fa4d6fc2890f64 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 25 Aug 2023 15:26:31 -0700 Subject: [PATCH 078/117] #316 trying to make OMIM play nice --- umls_util.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/umls_util.py b/umls_util.py index e68ef4b6..c87ecec5 100644 --- a/umls_util.py +++ b/umls_util.py @@ -57,6 +57,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.DEFINITIONS_KEY = 'definitions' self.RELATIONS_KEY = 'relations' self.last_source = '' + self.hgnc_to_omim = dict() def process_node(self, source, node_id, data): @@ -305,7 +306,7 @@ def process_hgnc_item(self, node_id, info, umls_code): ena = attributes.get('ENA', list()) rgd_id = attributes.get('RGD_ID', list()) date_symbol_changed = attributes.get('DATE_SYMBOL_CHANGED', list()) - omim_id = attributes.get('OMIM_ID', list()) + omim_id_list = attributes.get('OMIM_ID', list()) 
gene_fam_id = attributes.get('GENE_FAM_ID', list()) gene_symbol = attributes.get('GENESYMBOL', list()) ez = attributes.get('EZ', list()) @@ -313,6 +314,10 @@ def process_hgnc_item(self, node_id, info, umls_code): lncipedia = attributes.get('LNCIPEDIA', list()) gene_fam_desc = attributes.get('GENE_FAM_DESC', list()) + if len(gene_symbol) > 0: + for omim_id in omim_id_list: + self.hgnc_to_omim[self.make_node_id(kg2_util.CURIE_PREFIX_OMIM, omim_id)] = gene_symbol[0] + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_xref_edges(node_curie, cuis, provided_by) @@ -583,7 +588,7 @@ def process_omim_item(self, node_id, info, umls_code): # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) - genesymbol = attributes.get('GENESYMBOL', list()) + gene_symbol = attributes.get('GENESYMBOL', list()) mimtypevalue = attributes.get('MIMTYPEVALUE', list()) moved_from = attributes.get('MOVED_FROM', list()) sos = attributes.get('SOS', list()) @@ -591,6 +596,18 @@ def process_omim_item(self, node_id, info, umls_code): mimtypemeaning = attributes.get('MIMTYPEMEANING', list()) mimtype = attributes.get('MIMTYPE', list()) + name = name.capitalize() + if len(mimtype) > 0: + mimtype = int(mimtype[0]) + if mimtype in [0, 3, 5]: + category = kg2_util.BIOLINK_CATEGORY_PHENOTYPIC_FEATURE + name += " related phenotypic feature" + if mimtype in [1, 4]: + category = kg2_util.BIOLINK_CATEGORY_GENE + if len(gene_symbol) > 0: + name = gene_symbol[0] + name = self.hgnc_to_omim.get(node_curie, name) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_xref_edges(node_curie, cuis, provided_by) From c182c785e75c84f8d6b7d08cbe82db9274cb9c5e Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 29 Aug 2023 13:24:46 -0700 Subject: [PATCH 079/117] #316 HGNC nodes should be 
genes and names should be their gene name --- umls_util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/umls_util.py b/umls_util.py index c87ecec5..75cf6b15 100644 --- a/umls_util.py +++ b/umls_util.py @@ -317,6 +317,9 @@ def process_hgnc_item(self, node_id, info, umls_code): if len(gene_symbol) > 0: for omim_id in omim_id_list: self.hgnc_to_omim[self.make_node_id(kg2_util.CURIE_PREFIX_OMIM, omim_id)] = gene_symbol[0] + name = gene_symbol[0] + " (human)" + + category = kg2_util.BIOLINK_CATEGORY_GENE self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_xref_edges(node_curie, cuis, provided_by) From 26f51c08e54377fe7569ef72bca62e044658fceb Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 29 Aug 2023 15:19:42 -0700 Subject: [PATCH 080/117] #316 don't need (human) on HGNC nodes --- umls_util.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/umls_util.py b/umls_util.py index 75cf6b15..0d84830e 100644 --- a/umls_util.py +++ b/umls_util.py @@ -81,11 +81,14 @@ def create_accession_sources_heirarchy(self): self.ACCESSION_SOURCES_HEIRARCHY[source] = list() self.ACCESSION_SOURCES_HEIRARCHY[source].append(key) - def make_umls_node(self, node_curie, iri, name, category, update_date, provided_by, synonyms, description): + def make_umls_node(self, node_curie, iri, name, category, update_date, provided_by, synonyms, description, full_name=None): node = kg2_util.make_node(node_curie, iri, name, category, "2023", provided_by) node['synonym'] = synonyms node['description'] = description + if full_name is not None: + node['full_name'] = full_name + self.nodes_output.write(node) def make_node_id(self, curie_prefix, node_id): @@ -117,7 +120,6 @@ def create_xref_edges(self, subject_id, cuis, provided_by): # TODO: resolve update_date self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) -## TODO: 
make relation nodes ## TODO: make TUI nodes def create_umls_edges(self, subject_id, relations): @@ -282,6 +284,8 @@ def process_hcpcs_item(self, node_id, info, umls_code): def process_hgnc_item(self, node_id, info, umls_code): node_curie, iri, name, category, provided_by, synonyms, description, cuis, tuis = self.get_basic_info(umls_code, node_id.replace('HGNC:', ''), info) + full_name = name + # Currently not used, but extracting them in case we want them in the future attributes = info.get(self.INFO_KEY, dict()) mgd_id = attributes.get('MGD_ID', list()) @@ -317,11 +321,11 @@ def process_hgnc_item(self, node_id, info, umls_code): if len(gene_symbol) > 0: for omim_id in omim_id_list: self.hgnc_to_omim[self.make_node_id(kg2_util.CURIE_PREFIX_OMIM, omim_id)] = gene_symbol[0] - name = gene_symbol[0] + " (human)" + name = gene_symbol[0] category = kg2_util.BIOLINK_CATEGORY_GENE - self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description), full_name=full_name) self.create_xref_edges(node_curie, cuis, provided_by) From c3805bbe016e8faec0860d22697bb08694890633 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 31 Aug 2023 16:18:45 -0700 Subject: [PATCH 081/117] #316 addressing biolink:Drug vs biolink:ChemicalEntity discussion --- umls_util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/umls_util.py b/umls_util.py index 0d84830e..751264fd 100644 --- a/umls_util.py +++ b/umls_util.py @@ -211,6 +211,9 @@ def process_drugbank_item(self, node_id, info, umls_code): secondary_accession_keys = info.get(self.INFO_KEY, dict()).get('SID', list()) # TODO: figure out update date + + category = kg2_util.BIOLINK_CATEGORY_DRUG + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) 
self.create_xref_edges(node_curie, cuis, provided_by) @@ -481,6 +484,9 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) + if tuis == ("T109", "T121"): + category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_xref_edges(node_curie, cuis, provided_by) @@ -716,5 +722,8 @@ def process_umls_item(self, node_id, info, umls_code): if node_curie == None: return + if tuis == ("T109", "T121"): + category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_umls_edges(node_curie, info.get(self.RELATIONS_KEY, dict())) From c0fb8fa92a0e137f5ced72e4de440d0d555cccfc Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 00:00:30 -0700 Subject: [PATCH 082/117] #316 lists for the tuis not tuples --- umls_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/umls_util.py b/umls_util.py index 751264fd..6a824fd7 100644 --- a/umls_util.py +++ b/umls_util.py @@ -484,7 +484,7 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) - if tuis == ("T109", "T121"): + if tuis == ["T109", "T121"]: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) @@ -722,7 +722,7 @@ def process_umls_item(self, node_id, info, umls_code): if node_curie == None: return - if tuis == ("T109", "T121"): + if tuis == ["T109", "T121"]: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) From 
4ce14877f41a12dfe7af63c151c6e9b5733b5ebf Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 00:35:57 -0700 Subject: [PATCH 083/117] #316 addressing NCIT gene to named thing issue --- umls_util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/umls_util.py b/umls_util.py index 6a824fd7..86cd5d13 100644 --- a/umls_util.py +++ b/umls_util.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -'''umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format +'''umls_util.py: handles source-specific conversion of UMLS MySQL JSON Lines dump into KG2 JSON format - Usage: umls_list_jsonl_to_kg_jsonl.py [--test] + Usage: import umls_util.py ''' __author__ = 'Erica Wood' @@ -583,6 +583,9 @@ def process_nci_item(self, node_id, info, umls_code): us_recommended_intake = attributes.get('US_RECOMMENDED_INTAKE', list()) chemical_formula = attributes.get('CHEMICAL_FORMULA', list()) + if tuis == ['T028'] and (len(entrezgene_id) > 0 or len(hgnc_id) > 0 or len(gene_encodes_product) > 0 or "gene" in name.lower() or "allele" in name.lower()): + category = kg2_util.BIOLINK_CATEGORY_GENE + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_xref_edges(node_curie, cuis, provided_by) From 91e4e5eb8833f5836d5c4fd6a1c4840cbd39cfce Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 01:25:51 -0700 Subject: [PATCH 084/117] #316 there's a lot of T109-drug nodes, hopefully this will clear out the rest of the inconsistencies --- umls_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/umls_util.py b/umls_util.py index 86cd5d13..97dabd0b 100644 --- a/umls_util.py +++ b/umls_util.py @@ -484,7 +484,7 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) - if tuis == ["T109", "T121"]: + if "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY 
self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) @@ -725,7 +725,7 @@ def process_umls_item(self, node_id, info, umls_code): if node_curie == None: return - if tuis == ["T109", "T121"]: + if "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) From 29a4f7175d924d93a9621ca765563ef0e609558c Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 01:57:19 -0700 Subject: [PATCH 085/117] #316 might have overcorrected --- umls_list_jsonl_to_kg_jsonl.py | 2 +- umls_util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 35fd780e..408f9765 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -67,7 +67,7 @@ def extract_node_id(node_id_str): for data in input_items: # There should only be one item in the data dictionary for entity in data: - if entity == "('NOCODE', 'MTH')": + if entity == "('MTH', 'NOCODE')": continue value = data[entity] source, node_id = extract_node_id(entity) diff --git a/umls_util.py b/umls_util.py index 97dabd0b..4dd3856a 100644 --- a/umls_util.py +++ b/umls_util.py @@ -484,7 +484,7 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) - if "T109" in tuis: + if category == kg2_util.BIOLINK_CATEGORY_GENE and "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) @@ -725,7 +725,7 @@ def process_umls_item(self, node_id, info, umls_code): if node_curie == None: return - if "T109" in tuis: + if category == kg2_util.BIOLINK_CATEGORY_GENE and "T109" in tuis: category = 
kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) From 54e8cdc20d39e4ec4629b1a54f318a78016894d0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 02:46:52 -0700 Subject: [PATCH 086/117] #316 correcting a typo and addressing issue of former biolink:BiologicalEntity nodes that should be genes --- umls_util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/umls_util.py b/umls_util.py index 4dd3856a..6c8e9b18 100644 --- a/umls_util.py +++ b/umls_util.py @@ -484,7 +484,7 @@ def process_msh_item(self, node_id, info, umls_code): ol = attributes.get('OL', list()) mn = attributes.get('MN', list()) - if category == kg2_util.BIOLINK_CATEGORY_GENE and "T109" in tuis: + if category == kg2_util.BIOLINK_CATEGORY_DRUG and "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) @@ -725,8 +725,11 @@ def process_umls_item(self, node_id, info, umls_code): if node_curie == None: return - if category == kg2_util.BIOLINK_CATEGORY_GENE and "T109" in tuis: + if category == kg2_util.BIOLINK_CATEGORY_DRUG and "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY + if category == kg2_util.BIOLINK_NAMED_THING and tuis == ["T028"] and ("gene" in name.lower() or "allele" in name.lower()): + category = kg2_util.BIOLINK_CATEGORY_GENE + self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) self.create_umls_edges(node_curie, info.get(self.RELATIONS_KEY, dict())) From 3a993ed3b9f3145f83d215b9921353309742d9ce Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 02:54:17 -0700 Subject: [PATCH 087/117] #316 addressing category typo --- umls_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/umls_util.py b/umls_util.py index 6c8e9b18..5fe81931 100644 --- a/umls_util.py +++ b/umls_util.py @@ -728,7 +728,7 @@ def process_umls_item(self, node_id, info, umls_code): if category == kg2_util.BIOLINK_CATEGORY_DRUG and "T109" in tuis: category = kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY - if category == kg2_util.BIOLINK_NAMED_THING and tuis == ["T028"] and ("gene" in name.lower() or "allele" in name.lower()): + if category == kg2_util.BIOLINK_CATEGORY_NAMED_THING and tuis == ["T028"] and ("gene" in name.lower() or "allele" in name.lower()): category = kg2_util.BIOLINK_CATEGORY_GENE self.make_umls_node(node_curie, iri, name, category, "2023", provided_by, synonyms, self.create_description(tuis, description)) From 2449bb9e7b854ef3881e973b7d5042ef896987f7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 14:42:25 -0700 Subject: [PATCH 088/117] #316 source nodes try 1 --- umls_util.py | 54 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/umls_util.py b/umls_util.py index 5fe81931..97a4e299 100644 --- a/umls_util.py +++ b/umls_util.py @@ -23,29 +23,29 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.TUI_MAPPINGS = tui_mappings self.IRI_MAPPINGS = iri_mappings self.full_name_heirarchy = full_name_heirarchy - self.SOURCES = {'ATC': [self.process_atc_item, kg2_util.CURIE_PREFIX_ATC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ATC')], - 'CHV': [self.process_chv_item, kg2_util.CURIE_PREFIX_CHV, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'CHV')], - 'DRUGBANK': [self.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'DRUGBANK')], - 'FMA': [self.process_fma_item, kg2_util.CURIE_PREFIX_FMA, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'FMA')], - 'GO': [self.process_go_item, kg2_util.CURIE_PREFIX_GO, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'GO')], - 'HCPCS': 
[self.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HCPCS')], - 'HGNC': [self.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HGNC')], - 'HL7V3.0': [self.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HL7')], - 'HPO': [self.process_hpo_item, kg2_util.CURIE_PREFIX_HP, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HPO')], - 'ICD10PCS': [self.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD10PCS')], - 'ICD9CM': [self.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD9CM')], - 'MED-RT': [self.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MED-RT')], - 'MEDLINEPLUS': [self.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MEDLINEPLUS')], - 'MSH': [self.process_msh_item, kg2_util.CURIE_PREFIX_MESH, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MSH')], - 'MTH': [self.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MTH')], - 'NCBI': [self.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCBITAXON')], - 'NCI': [self.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI')], - 'NDDF': [self.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI')], - 'OMIM': [self.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'OMIM')], - 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ')], - 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, 
self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY')], - 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM')], - 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF')], + self.SOURCES = {'ATC': [self.process_atc_item, kg2_util.CURIE_PREFIX_ATC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ATC'), "Anatomical Therapeutic Chemical Classification System"], + 'CHV': [self.process_chv_item, kg2_util.CURIE_PREFIX_CHV, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'CHV'), "Consumer Health Vocabulary"], + 'DRUGBANK': [self.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'DRUGBANK'), "DrugBank"], + 'FMA': [self.process_fma_item, kg2_util.CURIE_PREFIX_FMA, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'FMA'), "Foundational Model of Anatomy"], + 'GO': [self.process_go_item, kg2_util.CURIE_PREFIX_GO, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'GO'), "Gene Ontology"], + 'HCPCS': [self.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HCPCS'), "Healthcare Common Procedure Coding System"], + 'HGNC': [self.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HGNC'), "HUGO Gene Nomenclature Committee"], + 'HL7V3.0': [self.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HL7'), "HL7 Version 3.0"], + 'HPO': [self.process_hpo_item, kg2_util.CURIE_PREFIX_HP, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HPO'), "Human Phenotype Ontology"], + 'ICD10PCS': [self.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD10PCS'), "ICD-10 Procedure Coding System"], + 'ICD9CM': [self.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, 
self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD9CM'), "International Classification of Diseases, Ninth Revision, Clinical Modification"], + 'MED-RT': [self.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MED-RT'), "Medication Reference Terminology"], + 'MEDLINEPLUS': [self.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MEDLINEPLUS'), "MedlinePlus Health Topics"], + 'MSH': [self.process_msh_item, kg2_util.CURIE_PREFIX_MESH, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MSH'), "Medical Subject Headings"], + 'MTH': [self.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MTH'), "Metathesaurus Names"], + 'NCBI': [self.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCBITAXON'), "NCBI Taxonomy"], + 'NCI': [self.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI'), "NCI Thesaurus"], + 'NDDF': [self.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NDDF'), "National Drug Data File"], + 'OMIM': [self.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'OMIM'), "Online Mendelian Inheritance in Man"], + 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ'), "Physician Data Query"], + 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY'), "Psychological Index Terms"], + 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM'), "RXNORM"], + 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF'), "National Drug File"], 
'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls')]} self.create_umls_accession_heirarchy() self.create_accession_sources_heirarchy() @@ -59,6 +59,14 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.last_source = '' self.hgnc_to_omim = dict() + for source in self.SOURCES: + source_id = self.SOURCES[source][2] + curie_prefix = source_id.split(':')[0] + node_specific_id = source_id.split(':')[1] + iri = IRI_MAPPINGS[curie_prefix] + node_specific_id + name = self.SOURCES[source][3] + self.make_umls_node(source_id, iri, name, kg2_util.SOURCE_NODE_CATEGORY, "2023", source_id, list(), "") + def process_node(self, source, node_id, data): if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES: From d050137a0e2ac9a2f80fe88b81c80e42aae32585 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 14:50:29 -0700 Subject: [PATCH 089/117] #316 iri map needs to be called with self --- umls_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_util.py b/umls_util.py index 97a4e299..3827f413 100644 --- a/umls_util.py +++ b/umls_util.py @@ -63,7 +63,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ source_id = self.SOURCES[source][2] curie_prefix = source_id.split(':')[0] node_specific_id = source_id.split(':')[1] - iri = IRI_MAPPINGS[curie_prefix] + node_specific_id + iri = self.IRI_MAPPINGS[curie_prefix] + node_specific_id name = self.SOURCES[source][3] self.make_umls_node(source_id, iri, name, kg2_util.SOURCE_NODE_CATEGORY, "2023", source_id, list(), "") From 8abca8c518dec9e3b1ec87270d6f0aa301ae29ae Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 14:52:31 -0700 Subject: [PATCH 090/117] #316 name for UMLS --- umls_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_util.py b/umls_util.py index 3827f413..54b3a946 100644 
--- a/umls_util.py +++ b/umls_util.py @@ -46,7 +46,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY'), "Psychological Index Terms"], 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM'), "RXNORM"], 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF'), "National Drug File"], - 'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls')]} + 'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls'), "Unified Medical Language System"]} self.create_umls_accession_heirarchy() self.create_accession_sources_heirarchy() From 16f99bc9b3b2a093dd0361231eb52afb8c007cf8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 16:06:35 -0700 Subject: [PATCH 091/117] #316 get source name/version information directly from UMLS --- umls_mysql_to_list_jsonl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 24259de7..e69d64c3 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -27,15 +27,16 @@ def get_args(): return arg_parser.parse_args() -def get_english_sources(cursor): - sources_sql_statement = "SELECT RSAB, LAT FROM MRSAB" +def get_english_sources(cursor, output): + sources_sql_statement = "SELECT RSAB, LAT, SSN, IMETA, SVER FROM MRSAB" sources = [] cursor.execute(sources_sql_statement) for result in cursor.fetchall(): - (source, language) = result + (source, language, source_name, version, update_date) = result if language == 'ENG': sources.append(source) + output.write({("UMLS_SOURCE", source): {"update_date": 
update_date, "source_name": source_name, "version": version}}) print("Finished sources_sql_statement at", kg2_util.date()) @@ -241,7 +242,7 @@ def cui_sources(cursor, output, sources): cursor.fetchall() # This ensure we don't have UMLS sources that overwrite each other's names - sources = get_english_sources(cursor) + sources = get_english_sources(cursor, output) code_sources(cursor, output) cui_sources(cursor, output, sources) From 97801007cb868e835fc03d2885280be0877f6acd Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 1 Sep 2023 16:18:24 -0700 Subject: [PATCH 092/117] #316 adding umls source node processing into umls_util --- umls_list_jsonl_to_kg_jsonl.py | 1 + umls_util.py | 70 ++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 408f9765..8d58b3ed 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -74,6 +74,7 @@ def extract_node_id(node_id_str): # Process the data specifically by source umls_processor.process_node(source, node_id, value) + print("Finished processing", umls_processor.last_source, "at", kg2_util.date()) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) diff --git a/umls_util.py b/umls_util.py index 54b3a946..0d42297f 100644 --- a/umls_util.py +++ b/umls_util.py @@ -23,30 +23,31 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.TUI_MAPPINGS = tui_mappings self.IRI_MAPPINGS = iri_mappings self.full_name_heirarchy = full_name_heirarchy - self.SOURCES = {'ATC': [self.process_atc_item, kg2_util.CURIE_PREFIX_ATC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ATC'), "Anatomical Therapeutic Chemical Classification System"], - 'CHV': [self.process_chv_item, kg2_util.CURIE_PREFIX_CHV, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'CHV'), "Consumer 
Health Vocabulary"], - 'DRUGBANK': [self.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'DRUGBANK'), "DrugBank"], - 'FMA': [self.process_fma_item, kg2_util.CURIE_PREFIX_FMA, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'FMA'), "Foundational Model of Anatomy"], - 'GO': [self.process_go_item, kg2_util.CURIE_PREFIX_GO, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'GO'), "Gene Ontology"], - 'HCPCS': [self.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HCPCS'), "Healthcare Common Procedure Coding System"], - 'HGNC': [self.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HGNC'), "HUGO Gene Nomenclature Committee"], - 'HL7V3.0': [self.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HL7'), "HL7 Version 3.0"], - 'HPO': [self.process_hpo_item, kg2_util.CURIE_PREFIX_HP, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HPO'), "Human Phenotype Ontology"], - 'ICD10PCS': [self.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD10PCS'), "ICD-10 Procedure Coding System"], - 'ICD9CM': [self.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD9CM'), "International Classification of Diseases, Ninth Revision, Clinical Modification"], - 'MED-RT': [self.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MED-RT'), "Medication Reference Terminology"], - 'MEDLINEPLUS': [self.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MEDLINEPLUS'), "MedlinePlus Health Topics"], - 'MSH': [self.process_msh_item, kg2_util.CURIE_PREFIX_MESH, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MSH'), "Medical Subject Headings"], - 'MTH': 
[self.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MTH'), "Metathesaurus Names"], - 'NCBI': [self.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCBITAXON'), "NCBI Taxonomy"], - 'NCI': [self.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI'), "NCI Thesaurus"], - 'NDDF': [self.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NDDF'), "National Drug Data File"], - 'OMIM': [self.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'OMIM'), "Online Mendelian Inheritance in Man"], - 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ'), "Physician Data Query"], - 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY'), "Psychological Index Terms"], - 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM'), "RXNORM"], - 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF'), "National Drug File"], - 'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls'), "Unified Medical Language System"]} + self.SOURCES = {'UMLS_SOURCE': [self.process_umls_source_item, None, None], + 'ATC': [self.process_atc_item, kg2_util.CURIE_PREFIX_ATC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ATC')], + 'CHV': [self.process_chv_item, kg2_util.CURIE_PREFIX_CHV, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'CHV')], + 'DRUGBANK': [self.process_drugbank_item, kg2_util.CURIE_PREFIX_DRUGBANK, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'DRUGBANK')], + 'FMA': 
[self.process_fma_item, kg2_util.CURIE_PREFIX_FMA, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'FMA')], + 'GO': [self.process_go_item, kg2_util.CURIE_PREFIX_GO, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'GO')], + 'HCPCS': [self.process_hcpcs_item, kg2_util.CURIE_PREFIX_HCPCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HCPCS')], + 'HGNC': [self.process_hgnc_item, kg2_util.CURIE_PREFIX_HGNC, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HGNC')], + 'HL7V3.0': [self.process_hl7_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HL7')], + 'HPO': [self.process_hpo_item, kg2_util.CURIE_PREFIX_HP, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'HPO')], + 'ICD10PCS': [self.process_icd10pcs_item, kg2_util.CURIE_PREFIX_ICD10PCS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD10PCS')], + 'ICD9CM': [self.process_icd9cm_item, kg2_util.CURIE_PREFIX_ICD9, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'ICD9CM')], + 'MED-RT': [self.process_medrt_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MED-RT')], + 'MEDLINEPLUS': [self.process_medlineplus_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MEDLINEPLUS')], + 'MSH': [self.process_msh_item, kg2_util.CURIE_PREFIX_MESH, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MSH')], + 'MTH': [self.process_mth_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'MTH')], + 'NCBI': [self.process_ncbi_item, kg2_util.CURIE_PREFIX_NCBI_TAXON, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCBITAXON')], + 'NCI': [self.process_nci_item, kg2_util.CURIE_PREFIX_NCIT, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NCI')], + 'NDDF': [self.process_nddf_item, kg2_util.CURIE_PREFIX_NDDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'NDDF')], + 'OMIM': [self.process_omim_item, kg2_util.CURIE_PREFIX_OMIM, 
self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'OMIM')], + 'PDQ': [self.process_pdq_item, kg2_util.CURIE_PREFIX_PDQ, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PDQ')], + 'PSY': [self.process_psy_item, kg2_util.CURIE_PREFIX_PSY, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'PSY')], + 'RXNORM': [self.process_rxnorm_item, kg2_util.CURIE_PREFIX_RXNORM, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'RXNORM')], + 'VANDF': [self.process_vandf_item, kg2_util.CURIE_PREFIX_VANDF, self.make_node_id(kg2_util.CURIE_PREFIX_UMLS_SOURCE, 'VANDF')], + 'UMLS': [self.process_umls_item, kg2_util.CURIE_PREFIX_UMLS, self.make_node_id(kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY, 'umls')]} self.create_umls_accession_heirarchy() self.create_accession_sources_heirarchy() @@ -59,18 +60,11 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ self.last_source = '' self.hgnc_to_omim = dict() - for source in self.SOURCES: - source_id = self.SOURCES[source][2] - curie_prefix = source_id.split(':')[0] - node_specific_id = source_id.split(':')[1] - iri = self.IRI_MAPPINGS[curie_prefix] + node_specific_id - name = self.SOURCES[source][3] - self.make_umls_node(source_id, iri, name, kg2_util.SOURCE_NODE_CATEGORY, "2023", source_id, list(), "") - def process_node(self, source, node_id, data): if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES: print("Finished processing", self.last_source, "at", kg2_util.date()) + print("Started processing", source, "at", kg2_util.date()) self.last_source = source if source in self.SOURCES: self.SOURCES[source][0](node_id, data, source) @@ -182,7 +176,17 @@ def create_description(self, tuis, comment=""): for tui in tuis: description += "; UMLS Semantic Type: STY:" + tui description = description.strip("; ") - return description + return description + + + def process_umls_source_item(self, node_id, info, umls_code): + source_id = self.SOURCES[node_id][2] + 
curie_prefix = source_id.split(':')[0] + node_specific_id = source_id.split(':')[1] + iri = self.IRI_MAPPINGS[curie_prefix] + node_specific_id + name = info.get('source_name', '') + ' v' + info.get('version', '') + update_date = info.get('update_date', '') + self.make_umls_node(source_id, iri, name, kg2_util.SOURCE_NODE_CATEGORY, update_date, source_id, list(), "") def process_atc_item(self, node_id, info, umls_code): From 928af259299a18042bd945d90df8fcb613e1948d Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 11:44:06 -0700 Subject: [PATCH 093/117] #316 source key can't be a tuple --- umls_mysql_to_list_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index e69d64c3..43118d51 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -36,7 +36,7 @@ def get_english_sources(cursor, output): (source, language, source_name, version, update_date) = result if language == 'ENG': sources.append(source) - output.write({("UMLS_SOURCE", source): {"update_date": update_date, "source_name": source_name, "version": version}}) + output.write({str(("UMLS_SOURCE", source)): {"update_date": update_date, "source_name": source_name, "version": version}}) print("Finished sources_sql_statement at", kg2_util.date()) From 3be034f61202a284043333ebc52fa62826d03410 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 12:50:11 -0700 Subject: [PATCH 094/117] #316 only want some umls source nodes --- umls_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/umls_util.py b/umls_util.py index 0d42297f..8e07275a 100644 --- a/umls_util.py +++ b/umls_util.py @@ -180,6 +180,8 @@ def create_description(self, tuis, comment=""): def process_umls_source_item(self, node_id, info, umls_code): + if node_id not in self.SOURCES: + return source_id = self.SOURCES[node_id][2] curie_prefix = source_id.split(':')[0] node_specific_id = source_id.split(':')[1] From 
7d2254c4a38fd616bae88cf2633967932ba4ebb6 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 12:59:38 -0700 Subject: [PATCH 095/117] #316 trying to stop the starting finishing spam --- umls_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_util.py b/umls_util.py index 8e07275a..4a557c08 100644 --- a/umls_util.py +++ b/umls_util.py @@ -62,7 +62,7 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ def process_node(self, source, node_id, data): - if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES: + if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES and source in self.SOURCES: print("Finished processing", self.last_source, "at", kg2_util.date()) print("Started processing", source, "at", kg2_util.date()) self.last_source = source From 4a5cc59091fbc6614a30ec18730c0f8c342b8849 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 13:03:31 -0700 Subject: [PATCH 096/117] #316 trying to stop the starting finishing spam trial 2 --- umls_list_jsonl_to_kg_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 8d58b3ed..3ff28081 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -74,7 +74,7 @@ def extract_node_id(node_id_str): # Process the data specifically by source umls_processor.process_node(source, node_id, value) - print("Finished processing", umls_processor.last_source, "at", kg2_util.date()) + print("Finished processing", umls_processor.last_source, "at", kg2_util.date()) kg2_util.end_read_jsonlines(input_read_jsonlines_info) kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) From cdb09ae79df22d06e6e6c482061fcdba9ab8ef00 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 13:06:58 -0700 Subject: [PATCH 097/117] #316 print correct 
starting/finishing sources --- umls_util.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/umls_util.py b/umls_util.py index 4a557c08..f82dfe59 100644 --- a/umls_util.py +++ b/umls_util.py @@ -62,9 +62,11 @@ def __init__(self, nodes_output, edges_output, tui_mappings, iri_mappings, full_ def process_node(self, source, node_id, data): - if source != self.last_source and self.last_source != '' and self.last_source in self.SOURCES and source in self.SOURCES: - print("Finished processing", self.last_source, "at", kg2_util.date()) - print("Started processing", source, "at", kg2_util.date()) + if source != self.last_source: + if self.last_source != '' and self.last_source in self.SOURCES: + print("Finished processing", self.last_source, "at", kg2_util.date()) + if source in self.SOURCES: + print("Started processing", source, "at", kg2_util.date()) self.last_source = source if source in self.SOURCES: self.SOURCES[source][0](node_id, data, source) From 644892400888ca0f04026fa4545b41f69e56521b Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 14:11:55 -0700 Subject: [PATCH 098/117] #316 handle duplicate source names --- umls_mysql_to_list_jsonl.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 43118d51..7b148b1d 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -31,14 +31,32 @@ def get_english_sources(cursor, output): sources_sql_statement = "SELECT RSAB, LAT, SSN, IMETA, SVER FROM MRSAB" sources = [] + source_data = dict() + cursor.execute(sources_sql_statement) for result in cursor.fetchall(): (source, language, source_name, version, update_date) = result if language == 'ENG': sources.append(source) - output.write({str(("UMLS_SOURCE", source)): {"update_date": update_date, "source_name": source_name, "version": version}}) + key = ("UMLS_SOURCE", source) + + if key in source_data: + old_date = 
source_data[key].get('update_date', '') + + old_date_val = old_date.strip('B').strip('A') + new_date_val = update_date.strip('B').strip('A') + + if new_date_val < old_date_val or (new_date_val == old_date_val and old_date_val.endswith('AB')): + continue + + source_data[key] = {"update_date": update_date, "source_name": source_name, "version": version} + + record_num = 0 + for key, val in source_data.items(): + record_num += 1 + output.write({str(key): val}) - print("Finished sources_sql_statement at", kg2_util.date()) + print("Finished adding", record_num, "records in get_english_sources() at", kg2_util.date()) return sources From b6d011c15e733a74a2816ed0d6e285b1f43ec688 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 14:51:26 -0700 Subject: [PATCH 099/117] #316 use string date to handle date priority --- umls_mysql_to_list_jsonl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 7b148b1d..48df6067 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -46,7 +46,7 @@ def get_english_sources(cursor, output): old_date_val = old_date.strip('B').strip('A') new_date_val = update_date.strip('B').strip('A') - if new_date_val < old_date_val or (new_date_val == old_date_val and old_date_val.endswith('AB')): + if new_date_val < old_date_val or (new_date_val == old_date_val and old_date.endswith('AB')): continue source_data[key] = {"update_date": update_date, "source_name": source_name, "version": version} From c17582f084d416e04caf337e9a0e963cca39ac3e Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 6 Sep 2023 21:23:54 -0700 Subject: [PATCH 100/117] #316 relation prefix --- umls_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/umls_util.py b/umls_util.py index f82dfe59..3edb4ce7 100644 --- a/umls_util.py +++ b/umls_util.py @@ -130,11 +130,12 @@ def create_umls_edges(self, subject_id, relations): for relation_source in relations: if 
relation_source in self.SOURCES: provided_by = self.SOURCES[relation_source][2] + relation_prefix = self.SOURCES[relation_source][1] for relation in relations[relation_source]: relation_abbr, relation_label, relation_direction = relation.split(',') if relation_label == 'None': relation_label = relation_abbr - relation_curie = self.make_node_id(relation_source, relation_label) + relation_curie = self.make_node_id(relation_prefix, relation_label) for cui in relations[relation_source][relation]: object_id = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) # TODO: resolve update_date From 0f74d1d552fd7d5a5e8d049b45a79b5c309c643f Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 13:22:53 -0700 Subject: [PATCH 101/117] #316 predicate remap additions for new UMLS ETL --- predicate-remap.yaml | 528 +++++++++++++++++++++++++++---------------- 1 file changed, 339 insertions(+), 189 deletions(-) diff --git a/predicate-remap.yaml b/predicate-remap.yaml index 1166fbdf..5aa422bd 100644 --- a/predicate-remap.yaml +++ b/predicate-remap.yaml @@ -1,3 +1,15 @@ +ATC:has_member: + operation: keep + core_predicate: biolink:related_to +ATC:inverse_isa: + operation: invert + core_predicate: biolink:subclass_of +ATC:isa: + operation: keep + core_predicate: biolink:subclass_of +ATC:member_of: + operation: keep + core_predicate: biolink:related_to BFO:0000050: operation: invert core_predicate: biolink:has_part @@ -1320,6 +1332,12 @@ FMA:insertion_of: FMA:internal_to: operation: keep core_predicate: biolink:coexists_with +FMA:inverse_isa: + operation: invert + core_predicate: biolink:subclass_of +FMA:isa: + operation: keep + core_predicate: biolink:subclass_of FMA:lateral_to: operation: keep core_predicate: biolink:coexists_with @@ -1481,6 +1499,18 @@ GENEPIO:0001739: # core_predicate: biolink:gene_associated_with_condition # GO:SIB: # operation: delete +GO:RB: + operation: invert + core_predicate: biolink:subclass_of +GO:RN: + operation: keep + core_predicate: biolink:subclass_of 
+GO:RO: + operation: keep + core_predicate: biolink:related_to +GO:SY: + operation: keep + core_predicate: biolink:close_match GO:acts_upstream_of: operation: keep core_predicate: biolink:affects @@ -1554,6 +1584,12 @@ GO:isa: GO:located_in: operation: keep core_predicate: biolink:located_in +GO:mth_expanded_form_of: + operation: keep + core_predicate: biolink:close_match +GO:mth_has_expanded_form: + operation: keep + core_predicate: biolink:close_match GO:negatively_regulated_by: operation: invert core_predicate: biolink:regulates @@ -1632,11 +1668,41 @@ HANCESTRO:0308: HANCESTRO:0330: operation: keep core_predicate: biolink:related_to +HCPCS:CHD: + operation: invert + core_predicate: biolink:subclass_of +HCPCS:PAR: + operation: keep + core_predicate: biolink:subclass_of HCPCS:mapped_from: operation: delete HCPCS:mapped_to: operation: keep core_predicate: biolink:related_to +HGNC:alias_of: + operation: keep + core_predicate: biolink:close_match +HGNC:expanded_form_of: + operation: keep + core_predicate: biolink:close_match +HGNC:has_alias: + operation: keep + core_predicate: biolink:close_match +HGNC:has_expanded_form: + operation: keep + core_predicate: biolink:close_match +HGNC:has_prev_name: + operation: keep + core_predicate: biolink:close_match +HGNC:has_prev_symbol: + operation: keep + core_predicate: biolink:close_match +HGNC:prev_name_of: + operation: keep + core_predicate: biolink:close_match +HGNC:prev_symbol_of: + operation: keep + core_predicate: biolink:close_match HMDB:at_cellular_location: operation: keep core_predicate: biolink:located_in @@ -1655,8 +1721,24 @@ HMDB:in_biospecimen: HMDB:in_pathway: operation: invert core_predicate: biolink:has_participant -# HP:SIB: -# operation: delete +HP:RB: + operation: invert + core_predicate: biolink:subclass_of +HP:RN: + operation: keep + core_predicate: biolink:subclass_of +HP:RO: + operation: keep + core_predicate: biolink:related_to +HP:SY: + operation: keep + core_predicate: biolink:close_match 
+HP:inverse_isa: + operation: invert + core_predicate: biolink:subclass_of +HP:isa: + operation: keep + core_predicate: biolink:close_match IAO:0000039: operation: keep core_predicate: biolink:related_to @@ -1669,10 +1751,24 @@ IAO:0000142: IAO:0000219: operation: keep core_predicate: biolink:related_to -# ICD10PCS:SIB: -# operation: delete -# ICD9:SIB: -# operation: delete +ICD10PCS:CHD: + operation: invert + core_predicate: biolink:subclass_of +ICD10PCS:PAR: + operation: keep + core_predicate: biolink:subclass_of +ICD10PCS:expanded_form_of: + operation: keep + core_predicate: biolink:close_match +ICD10PCS:has_expanded_form: + operation: keep + core_predicate: biolink:close_match +ICD9:CHD: + operation: invert + core_predicate: biolink:subclass_of +ICD9:PAR: + operation: keep + core_predicate: biolink:subclass_of IDO:0000664: operation: invert core_predicate: biolink:contributes_to @@ -1957,8 +2053,20 @@ LOINC:time_modifier_of: # core_predicate: biolink:has_part MESH:AQ: operation: delete +MESH:CHD: + operation: invert + core_predicate: biolink:subclass_of +MESH:PAR: + operation: keep + core_predicate: biolink:subclass_of MESH:QB: operation: delete +MESH:RB: + operation: invert + core_predicate: biolink:subclass_of +MESH:RN: + operation: keep + core_predicate: biolink:subclass_of MESH:RO: operation: keep core_predicate: biolink:related_to @@ -1967,6 +2075,9 @@ MESH:RO: MESH:has_mapping_qualifier: operation: keep core_predicate: biolink:related_to +MESH:has_permuted_term: + operation: keep + core_predicate: biolink:close_match MESH:inverse_isa: operation: keep core_predicate: biolink:superclass_of @@ -1980,6 +2091,9 @@ MESH:mapped_to: core_predicate: biolink:related_to MESH:mapping_qualifier_of: operation: delete +MESH:permuted_term_of: + operation: keep + core_predicate: biolink:close_match MI:0192: operation: keep core_predicate: biolink:directly_physically_interacts_with @@ -2153,201 +2267,30 @@ MONDO:part_of_progression_of_disease: MONDO:predisposes_towards: 
operation: keep core_predicate: biolink:contributes_to -# NBO-PROPERTY:by_means: -# operation: keep -# core_predicate: biolink:actively_involved_in -# NBO-PROPERTY:has_participant: -# operation: keep -# core_predicate: biolink:has_participant -# NBO-PROPERTY:in_response_to: -# operation: keep -# core_predicate: biolink:causes -# NBO-PROPERTY:is_about: -# operation: keep -# core_predicate: biolink:related_to -# NCIT:A11: -# operation: keep -# core_predicate: biolink:subclass_of -# NCIT:A14: -# operation: keep -# core_predicate: biolink:subclass_of -# NCIT:A16: -# operation: keep -# core_predicate: biolink:subclass_of -# NCIT:A3: -# operation: keep -# core_predicate: biolink:subclass_of -# NCIT:A7: -# operation: keep -# core_predicate: biolink:physically_interacts_with -# NCIT:Anatomic_Structure_Has_Location_Role: -# operation: invert -# core_predicate: biolink:located_in -# NCIT:C15220: -# operation: keep -# core_predicate: biolink:diagnoses -# NCIT:C16798: -# operation: keep -# core_predicate: biolink:in_linkage_disequilibrium_with +NCBITaxon:CHD: + operation: invert + core_predicate: biolink:subclass_of +NCBITaxon:PAR: + operation: keep + core_predicate: biolink:subclass_of +NCBITaxon:expanded_form_of: + operation: keep + core_predicate: biolink:close_match +NCBITaxon:has_expanded_form: + operation: keep + core_predicate: biolink:close_match NCIT:C2861: operation: keep core_predicate: biolink:has_side_effect -# NCIT:C37933: -# operation: keep -# core_predicate: biolink:contraindicated_for -# NCIT:R100: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R101: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R102: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R108: -# operation: keep -# core_predicate: biolink:has_phenotype -# NCIT:R113: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R115: -# operation: keep -# core_predicate: biolink:has_phenotype -# NCIT:R124: -# operation: keep -# core_predicate: 
biolink:affects -# NCIT:R130: -# operation: invert -# core_predicate: biolink:has_participant -# NCIT:R131: -# operation: invert -# core_predicate: biolink:has_participant -# NCIT:R133: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R145: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R146: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R150: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R155: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R156: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R158: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R160: -# operation: keep -# core_predicate: biolink:affects NCIT:R163: operation: keep core_predicate: biolink:related_to -# NCIT:R165: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R166: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R167: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R168: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R169: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R170: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R171: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R173: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R175: -# operation: keep -# core_predicate: biolink:gene_associated_with_condition -# NCIT:R176: -# operation: invert -# core_predicate: biolink:gene_associated_with_condition -# NCIT:R178: -# operation: invert -# core_predicate: biolink:gene_product_of -# NCIT:R23: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R25: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R27: -# operation: invert -# core_predicate: biolink:has_part -# NCIT:R29: -# operation: keep -# core_predicate: biolink:produces -# NCIT:R30: -# operation: keep -# core_predicate: biolink:affects -# NCIT:R36: -# operation: keep -# 
core_predicate: biolink:subclass_of -# NCIT:R37: -# operation: invert -# core_predicate: biolink:has_participant -# NCIT:R38: -# operation: keep -# core_predicate: biolink:gene_associated_with_condition -# NCIT:R39: -# operation: keep -# core_predicate: biolink:biomarker_for -# NCIT:R40: -# operation: keep -# core_predicate: biolink:located_in -# NCIT:R42: -# operation: keep -# core_predicate: biolink:subclass_of -# NCIT:R47: -# operation: keep -# core_predicate: biolink:biomarker_for -# NCIT:R48: -# operation: keep -# core_predicate: biolink:gene_associated_with_condition -# NCIT:R50: -# operation: keep -# core_predicate: biolink:has_part -# NCIT:R51: -# operation: invert -# core_predicate: biolink:has_participant -# NCIT:R52: -# operation: keep -# core_predicate: biolink:capable_of -# NCIT:R53: -# operation: invert -# core_predicate: biolink:has_participant -# NCIT:R72: -# operation: keep -# core_predicate: biolink:affects NCIT:R81: operation: keep core_predicate: biolink:related_to NCIT:R82: operation: invert core_predicate: biolink:has_part -# NCIT:R88: -# operation: keep -# core_predicate: biolink:related_to -# NCIT:R89: -# operation: keep -# core_predicate: biolink:has_phenotype NCIT:abnormal_cell_affected_by_chemical_or_drug: operation: delete NCIT:abnormality_associated_with_allele: @@ -2746,6 +2689,9 @@ NCIT:has_seronet_permissible_value: NCIT:has_target: operation: keep core_predicate: biolink:physically_interacts_with +NCIT:has_tradename: + operation: keep + core_predicate: biolink:close_match NCIT:human_disease_maps_to_eo_disease: operation: keep core_predicate: biolink:related_to @@ -2755,6 +2701,12 @@ NCIT:imaged_anatomy_has_procedure: operation: delete NCIT:inc_parent_of: operation: delete +NCIT:inverse_isa: + operation: invert + core_predicate: biolink:close_match +NCIT:isa: + operation: keep + core_predicate: biolink:close_match NCIT:is_abnormal_cell_of_disease: operation: keep core_predicate: biolink:related_to @@ -2979,6 +2931,9 @@ 
NCIT:target_anatomy_has_procedure: operation: delete NCIT:tissue_is_expression_site_of_gene_product: operation: delete +NCIT:tradename_of: + operation: keep + core_predicate: biolink:close_match NCIT:value_set_is_paired_with: operation: delete NDDF:dose_form_of: @@ -3070,11 +3025,35 @@ OBO:nbo#is_about: # OIO:hasDbXref: # operation: keep # core_predicate: biolink:close_match +OMIM:CHD: + operation: invert + core_predicate: biolink:subclass_of +OMIM:PAR: + operation: keep + core_predicate: biolink:subclass_of +OMIM:alias_of: + operation: keep + core_predicate: biolink:related_to OMIM:allelic_variant_of: operation: keep core_predicate: biolink:is_sequence_variant_of +OMIM:entry_term_of: + operation: keep + core_predicate: biolink:related_to +OMIM:expanded_form_of: + operation: keep + core_predicate: biolink:close_match +OMIM:has_alias: + operation: keep + core_predicate: biolink:related_to OMIM:has_allelic_variant: operation: delete +OMIM:has_entry_term: + operation: keep + core_predicate: biolink:related_to +OMIM:has_expanded_form: + operation: keep + core_predicate: biolink:close_match OMIM:has_inheritance_type: operation: keep core_predicate: biolink:related_to @@ -3176,6 +3155,9 @@ PATO:reciprocal_of: # PATO:towards: # operation: invert # core_predicate: biolink:actively_involved_in +PDQ:SY: + operation: keep + core_predicate: biolink:close_match PDQ:associated_disease: operation: keep core_predicate: biolink:correlated_with @@ -3184,9 +3166,33 @@ PDQ:associated_genetic_condition: PDQ:component_of: operation: invert core_predicate: biolink:has_part +PDQ:expanded_form_of: + operation: keep + core_predicate: biolink:close_match PDQ:has_component: operation: keep core_predicate: biolink:has_part +PDQ:has_expanded_form: + operation: keep + core_predicate: biolink:close_match +PDQ:has_lab_number: + operation: keep + core_predicate: biolink:related_to +PDQ:has_tradename: + operation: keep + core_predicate: biolink:related_to +PDQ:inverse_isa: + operation: invert + 
core_predicate: biolink:subclass_of +PDQ:isa: + operation: keep + core_predicate: biolink:subclass_of +PDQ:lab_number_of: + operation: keep + core_predicate: biolink:related_to +PDQ:tradename_of: + operation: keep + core_predicate: biolink:related_to PHAROS:drug_targets: operation: keep core_predicate: biolink:directly_physically_interacts_with @@ -3206,12 +3212,27 @@ PR:lacks_part: PR:non-covalently_bound_to: operation: keep core_predicate: biolink:physically_interacts_with +PSY:CHD: + operation: invert + core_predicate: biolink:subclass_of +PSY:PAR: + operation: keep + core_predicate: biolink:subclass_of PSY:RB: operation: invert core_predicate: biolink:subclass_of +PSY:RN: + operation: keep + core_predicate: biolink:subclass_of PSY:RO: operation: keep core_predicate: biolink:related_to +PSY:has_member: + operation: keep + core_predicate: biolink:subclass_of +PSY:member_of: + operation: invert + core_predicate: biolink:subclass_of PSY:use: operation: keep core_predicate: biolink:subclass_of @@ -4115,6 +4136,9 @@ RO:0040036: RO:participates_in: operation: invert core_predicate: biolink:has_participant +RXNORM:SY: + operation: keep + core_predicate: biolink:close_match RXNORM:consists_of: operation: keep core_predicate: biolink:has_part @@ -4158,6 +4182,12 @@ RXNORM:has_quantified_form: RXNORM:has_tradename: operation: keep core_predicate: biolink:related_to +RXNORM:included_in: + operation: keep + core_predicate: biolink:related_to +RXNORM:includes: + operation: keep + core_predicate: biolink:related_to RXNORM:ingredient_of: operation: invert core_predicate: biolink:has_part @@ -4614,9 +4644,18 @@ UBERON_CORE:synapsed_by: # UBERON_NONAMESPACE:subdivision_of: # operation: keep # core_predicate: biolink:coexists_with +UMLS:CHD: + operation: invert + core_predicate: biolink:subclass_of +UMLS:PAR: + operation: keep + core_predicate: biolink:subclass_of UMLS:RB: operation: invert core_predicate: biolink:subclass_of +UMLS:RN: + operation: keep + core_predicate: 
biolink:subclass_of UMLS:RO: operation: keep core_predicate: biolink:related_to @@ -4628,6 +4667,9 @@ UMLS:RQ: UMLS:SY: operation: keep core_predicate: biolink:close_match +UMLS:active_metabolites_of: + operation: keep + core_predicate: biolink:related_to UMLS:class_code_classified_by: operation: keep core_predicate: biolink:related_to @@ -4638,6 +4680,21 @@ UMLS:component_of: core_predicate: biolink:has_part UMLS:context_binding_of: operation: delete +UMLS:contraindicated_class_of: + operation: keep + core_predicate: biolink:related_to +UMLS:contraindicated_mechanism_of_action_of: + operation: keep + core_predicate: biolink:related_to +UMLS:contraindicated_physiologic_effect_of: + operation: keep + core_predicate: biolink:related_to +UMLS:contraindicated_with_disease: + operation: keep + core_predicate: biolink:related_to +UMLS:effect_may_be_inhibited_by: + operation: keep + core_predicate: biolink:related_to UMLS:exhibited_by: operation: keep core_predicate: biolink:related_to @@ -4645,18 +4702,36 @@ UMLS:exhibits: operation: delete UMLS:form_of: operation: delete +UMLS:has_active_metabolites: + operation: keep + core_predicate: biolink:related_to UMLS:has_component: operation: keep core_predicate: biolink:has_part UMLS:has_context_binding: operation: keep core_predicate: biolink:related_to +UMLS:has_contraindicated_class: + operation: keep + core_predicate: biolink:related_to +UMLS:has_contraindicated_drug: + operation: keep + core_predicate: biolink:related_to +UMLS:has_contraindicated_mechanism_of_action: + operation: keep + core_predicate: biolink:related_to +UMLS:has_contraindicated_physiologic_effect: + operation: keep + core_predicate: biolink:related_to UMLS:has_form: operation: keep core_predicate: biolink:related_to UMLS:has_mapping_qualifier: operation: keep core_predicate: biolink:related_to +UMLS:has_mechanism_of_action: + operation: keep + core_predicate: biolink:related_to UMLS:has_owning_affiliate: operation: invert core_predicate: 
biolink:has_part @@ -4664,6 +4739,12 @@ UMLS:has_owning_section: operation: delete UMLS:has_owning_subsection: operation: delete +UMLS:has_parent: + operation: keep + core_predicate: biolink:subclass_of +UMLS:has_pharmacokinetics: + operation: keep + core_predicate: biolink:related_to UMLS:has_physiologic_effect: operation: keep core_predicate: biolink:causes @@ -4676,24 +4757,63 @@ UMLS:has_supported_concept_property: UMLS:has_supported_concept_relationship: operation: keep core_predicate: biolink:related_to +UMLS:has_therapeutic_class: + operation: keep + core_predicate: biolink:related_to +UMLS:induced_by: + operation: keep + core_predicate: biolink:related_to +UMLS:induces: + operation: keep + core_predicate: biolink:related_to UMLS:larger_than: operation: keep core_predicate: biolink:related_to UMLS:mapped_from: operation: delete +UMLS:mapping_qualifier_of: + operation: keep + core_predicate: biolink:related_to UMLS:mapped_to: operation: keep core_predicate: biolink:related_to +UMLS:may_be_diagnosed_by: + operation: keep + core_predicate: biolink:related_to +UMLS:may_be_prevented_by: + operation: keep + core_predicate: biolink:related_to UMLS:may_be_qualified_by: operation: keep core_predicate: biolink:related_to +UMLS:may_be_treated_by: + operation: keep + core_predicate: biolink:related_to +UMLS:may_diagnose: + operation: keep + core_predicate: biolink:related_to +UMLS:may_inhibit_effect_of: + operation: keep + core_predicate: biolink:related_to +UMLS:may_prevent: + operation: keep + core_predicate: biolink:related_to UMLS:may_qualify: operation: delete +UMLS:may_treat: + operation: keep + core_predicate: biolink:related_to UMLS:measured_by: operation: delete UMLS:measures: operation: keep core_predicate: biolink:related_to +UMLS:mechanism_of_action_of: + operation: keep + core_predicate: biolink:related_to +UMLS:metabolic_site_of: + operation: keep + core_predicate: biolink:related_to UMLS:owning_affiliate_of: operation: delete UMLS:owning_section_of: @@ 
-4702,13 +4822,37 @@ UMLS:owning_section_of: UMLS:owning_subsection_of: operation: invert core_predicate: biolink:has_part +UMLS:parent_of: + operation: invert + core_predicate: biolink:subclass_of +UMLS:pharmacokinetics_of: + operation: keep + core_predicate: biolink:related_to +UMLS:physiologic_effect_of: + operation: keep + core_predicate: biolink:related_to UMLS:related_to: operation: keep core_predicate: biolink:related_to +UMLS:site_of_metabolism: + operation: keep + core_predicate: biolink:related_to +UMLS:smaller_than: + operation: keep + core_predicate: biolink:related_to +UMLS:structural_class_of: + operation: keep + core_predicate: biolink:related_to UMLS:supported_concept_property_in: operation: delete UMLS:supported_concept_relationship_in: operation: delete +UMLS:therapeutic_class_of: + operation: keep + core_predicate: biolink:related_to +UMLS:xref: + operation: keep + core_predicate: biolink:close_match # UO-PROPERTY:is_unit_of: # operation: keep # core_predicate: biolink:related_to @@ -4718,6 +4862,9 @@ UMLS:supported_concept_relationship_in: VANDF:has_ingredient: operation: keep core_predicate: biolink:has_part +VANDF:has_print_name: + operation: keep + core_predicate: biolink:close_match VANDF:ingredient_of: operation: invert core_predicate: biolink:has_part @@ -4727,6 +4874,9 @@ VANDF:inverse_isa: VANDF:isa: operation: keep core_predicate: biolink:subclass_of +VANDF:print_name_of: + operation: keep + core_predicate: biolink:close_match # WIKIDATA:P2888: # operation: keep # core_predicate: biolink:exact_match From c9e8b98d1abc2aa765f307a063d4a820f35c8960 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 13:58:15 -0700 Subject: [PATCH 102/117] #316 more robust date comparison --- umls_mysql_to_list_jsonl.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 48df6067..a929a045 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ 
-41,12 +41,7 @@ def get_english_sources(cursor, output): key = ("UMLS_SOURCE", source) if key in source_data: - old_date = source_data[key].get('update_date', '') - - old_date_val = old_date.strip('B').strip('A') - new_date_val = update_date.strip('B').strip('A') - - if new_date_val < old_date_val or (new_date_val == old_date_val and old_date.endswith('AB')): + if update_date < old_date: continue source_data[key] = {"update_date": update_date, "source_name": source_name, "version": version} From 5ad4bd7e7e106b1ecaa5680fa6a3c579157cc1ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 14:11:47 -0700 Subject: [PATCH 103/117] #316 actually more robust date comparison by comparing versions --- umls_mysql_to_list_jsonl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index a929a045..11dafac6 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -41,7 +41,9 @@ def get_english_sources(cursor, output): key = ("UMLS_SOURCE", source) if key in source_data: - if update_date < old_date: + old_ver = source_data[key].get('version', '') + + if version < old_ver: continue source_data[key] = {"update_date": update_date, "source_name": source_name, "version": version} From a4c2d28aab5e9e37f458bb984d3711f3811c0392 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 15:39:15 -0700 Subject: [PATCH 104/117] #316 subject is cui2 and object is cui1 --- umls_mysql_to_list_jsonl.py | 6 +++--- umls_util.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/umls_mysql_to_list_jsonl.py b/umls_mysql_to_list_jsonl.py index 11dafac6..c29925c1 100755 --- a/umls_mysql_to_list_jsonl.py +++ b/umls_mysql_to_list_jsonl.py @@ -196,8 +196,8 @@ def cui_sources(cursor, output, sources): cursor.execute(relations_sql_statement) for result in cursor.fetchall(): - (cui1, rel, rela, direction, cui2, source) = result - key = (umls_source_name, cui1) + (cui_object, rel, 
rela, direction, cui_subject, source) = result + key = (umls_source_name, cui_object) if key not in cui_source_info: # See above for explanation continue @@ -209,7 +209,7 @@ def cui_sources(cursor, output, sources): cui_source_info[key][relation_key][source] = dict() if relation_type_key not in cui_source_info[key][relation_key][source]: cui_source_info[key][relation_key][source][relation_type_key] = list() - cui_source_info[key][relation_key][source][relation_type_key].append(cui2) + cui_source_info[key][relation_key][source][relation_type_key].append(cui_subject) print("Finished relations_sql_statement at", kg2_util.date()) diff --git a/umls_util.py b/umls_util.py index 3edb4ce7..b2109b93 100644 --- a/umls_util.py +++ b/umls_util.py @@ -139,7 +139,7 @@ def create_umls_edges(self, subject_id, relations): for cui in relations[relation_source][relation]: object_id = self.make_node_id(kg2_util.CURIE_PREFIX_UMLS, cui) # TODO: resolve update_date - if relation_direction == 'Y': + if relation_direction == 'N': self.edges_output.write(kg2_util.make_edge(object_id, subject_id, relation_curie, relation_label, provided_by, "2023")) else: self.edges_output.write(kg2_util.make_edge(subject_id, object_id, relation_curie, relation_label, provided_by, "2023")) From 5732454dc0f2ef7dd0249768774a4c4fe27a7e3a Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 16:33:25 -0700 Subject: [PATCH 105/117] #316 updating UMLS predicate mappings based on meeting with Steve --- predicate-remap.yaml | 71 +++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/predicate-remap.yaml b/predicate-remap.yaml index 5aa422bd..f617a3bb 100644 --- a/predicate-remap.yaml +++ b/predicate-remap.yaml @@ -1,6 +1,6 @@ ATC:has_member: operation: keep - core_predicate: biolink:related_to + core_predicate: biolink:has_member ATC:inverse_isa: operation: invert core_predicate: biolink:subclass_of @@ -8,8 +8,8 @@ ATC:isa: operation: keep core_predicate: 
biolink:subclass_of ATC:member_of: - operation: keep - core_predicate: biolink:related_to + operation: invert + core_predicate: biolink:has_member BFO:0000050: operation: invert core_predicate: biolink:has_part @@ -1669,10 +1669,10 @@ HANCESTRO:0330: operation: keep core_predicate: biolink:related_to HCPCS:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of HCPCS:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of HCPCS:mapped_from: operation: delete @@ -1738,7 +1738,7 @@ HP:inverse_isa: core_predicate: biolink:subclass_of HP:isa: operation: keep - core_predicate: biolink:close_match + core_predicate: biolink:subclass_of IAO:0000039: operation: keep core_predicate: biolink:related_to @@ -1752,10 +1752,10 @@ IAO:0000219: operation: keep core_predicate: biolink:related_to ICD10PCS:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of ICD10PCS:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of ICD10PCS:expanded_form_of: operation: keep @@ -1764,10 +1764,10 @@ ICD10PCS:has_expanded_form: operation: keep core_predicate: biolink:close_match ICD9:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of ICD9:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of IDO:0000664: operation: invert @@ -2054,10 +2054,10 @@ LOINC:time_modifier_of: MESH:AQ: operation: delete MESH:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of MESH:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of MESH:QB: operation: delete @@ -2268,10 +2268,10 @@ MONDO:predisposes_towards: operation: keep core_predicate: biolink:contributes_to NCBITaxon:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of NCBITaxon:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of NCBITaxon:expanded_form_of: operation: keep @@ -2703,10 +2703,10 @@ NCIT:inc_parent_of: 
operation: delete NCIT:inverse_isa: operation: invert - core_predicate: biolink:close_match + core_predicate: biolink:subclass_of NCIT:isa: operation: keep - core_predicate: biolink:close_match + core_predicate: biolink:subclass_of NCIT:is_abnormal_cell_of_disease: operation: keep core_predicate: biolink:related_to @@ -3026,10 +3026,10 @@ OBO:nbo#is_about: # operation: keep # core_predicate: biolink:close_match OMIM:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of OMIM:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of OMIM:alias_of: operation: keep @@ -3176,11 +3176,10 @@ PDQ:has_expanded_form: operation: keep core_predicate: biolink:close_match PDQ:has_lab_number: - operation: keep - core_predicate: biolink:related_to + operation: delete PDQ:has_tradename: operation: keep - core_predicate: biolink:related_to + core_predicate: biolink:close_match PDQ:inverse_isa: operation: invert core_predicate: biolink:subclass_of @@ -3188,11 +3187,10 @@ PDQ:isa: operation: keep core_predicate: biolink:subclass_of PDQ:lab_number_of: - operation: keep - core_predicate: biolink:related_to + operation: delete PDQ:tradename_of: operation: keep - core_predicate: biolink:related_to + core_predicate: biolink:close_match PHAROS:drug_targets: operation: keep core_predicate: biolink:directly_physically_interacts_with @@ -3213,10 +3211,10 @@ PR:non-covalently_bound_to: operation: keep core_predicate: biolink:physically_interacts_with PSY:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of PSY:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of PSY:RB: operation: invert @@ -3229,10 +3227,10 @@ PSY:RO: core_predicate: biolink:related_to PSY:has_member: operation: keep - core_predicate: biolink:subclass_of + core_predicate: biolink:has_member PSY:member_of: operation: invert - core_predicate: biolink:subclass_of + core_predicate: biolink:has_member PSY:use: operation: keep 
core_predicate: biolink:subclass_of @@ -4645,10 +4643,10 @@ UBERON_CORE:synapsed_by: # operation: keep # core_predicate: biolink:coexists_with UMLS:CHD: - operation: invert + operation: keep core_predicate: biolink:subclass_of UMLS:PAR: - operation: keep + operation: invert core_predicate: biolink:subclass_of UMLS:RB: operation: invert @@ -4668,8 +4666,8 @@ UMLS:SY: operation: keep core_predicate: biolink:close_match UMLS:active_metabolites_of: - operation: keep - core_predicate: biolink:related_to + operation: invert + core_predicate: biolink:has_metabolite UMLS:class_code_classified_by: operation: keep core_predicate: biolink:related_to @@ -4680,7 +4678,7 @@ UMLS:component_of: core_predicate: biolink:has_part UMLS:context_binding_of: operation: delete -UMLS:contraindicated_class_of: +UMLS:contraindicated_class_of: # Consider mapping this to drug interaction if and when that biolink predicate becomes available operation: keep core_predicate: biolink:related_to UMLS:contraindicated_mechanism_of_action_of: @@ -4757,7 +4755,7 @@ UMLS:has_supported_concept_property: UMLS:has_supported_concept_relationship: operation: keep core_predicate: biolink:related_to -UMLS:has_therapeutic_class: +UMLS:has_therapeutic_class: # Come back to this one at another time, might have a chance to use has_member operation: keep core_predicate: biolink:related_to UMLS:induced_by: @@ -4772,8 +4770,7 @@ UMLS:larger_than: UMLS:mapped_from: operation: delete UMLS:mapping_qualifier_of: - operation: keep - core_predicate: biolink:related_to + operation: delete UMLS:mapped_to: operation: keep core_predicate: biolink:related_to @@ -4847,7 +4844,7 @@ UMLS:supported_concept_property_in: operation: delete UMLS:supported_concept_relationship_in: operation: delete -UMLS:therapeutic_class_of: +UMLS:therapeutic_class_of: # Look into this at a later time, there might be an opportunity to use has_member operation: keep core_predicate: biolink:related_to UMLS:xref: From 
2b07595b4512affa134bed2e94f2af40502be010 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 18:52:29 -0700 Subject: [PATCH 106/117] #316 updating build process for UMLS ETL --- Snakefile-conversion | 18 +++- Snakefile-extraction | 8 +- build-multi-ont-kg.sh | 23 ++--- extract-umls.sh | 44 +--------- multi_ont_to_kg_jsonl.py | 132 +--------------------------- ont-load-inventory.yaml | 179 -------------------------------------- snakemake-config-var.yaml | 15 +++- 7 files changed, 41 insertions(+), 378 deletions(-) diff --git a/Snakefile-conversion b/Snakefile-conversion index 7fae8d37..00167044 100644 --- a/Snakefile-conversion +++ b/Snakefile-conversion @@ -1,7 +1,19 @@ -rule Ontologies_and_TTL: +rule UMLS_Conversion: + input: + code = config['UMLS_CONVERSION_SCRIPT'], + real = config['UMLS_EXTRACT_FILE'], + validation = config['VALIDATION_PLACEHOLDER'] + output: + nodes = config['UMLS_OUTPUT_NODES_FILE'], + edges = config['UMLS_OUTPUT_EDGES_FILE'] + log: + config['UMLS_CONVERSION_LOG'] + shell: + "bash -x {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + +rule Ontologies_Conversion: input: code = config['ONT_CONVERSION_SCRIPT'], - real = config['UMLS_CUI_FILE'], validation = config['VALIDATION_PLACEHOLDER'] output: nodes = config['ONT_OUTPUT_NODES_FILE'], @@ -9,7 +21,7 @@ rule Ontologies_and_TTL: log: config['ONT_CONVERSION_LOG'] shell: - "bash -x {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + "bash -x {input.code} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule SemMedDB_Conversion: input: diff --git a/Snakefile-extraction b/Snakefile-extraction index 89eb44c5..14cf0eb8 100644 --- a/Snakefile-extraction +++ b/Snakefile-extraction @@ -1,13 +1,13 @@ rule UMLS: input: - code = config['ONT_EXTRACTION_SCRIPT'], + code = config['UMLS_EXTRACTION_SCRIPT'], validation = config['VALIDATION_PLACEHOLDER'] output: - config['UMLS_CUI_FILE'] 
+ config['UMLS_EXTRACT_FILE'] log: - config['ONT_EXTRACTION_LOG'] + config['UMLS_EXTRACTION_LOG'] shell: - "bash -x {input.code} " + config['BUILD_DIR'] + " {output} > {log} 2>&1" + "bash -x {input.code} {output} > {log} 2>&1" rule SemMedDB: input: diff --git a/build-multi-ont-kg.sh b/build-multi-ont-kg.sh index 1e95fa58..98e85998 100755 --- a/build-multi-ont-kg.sh +++ b/build-multi-ont-kg.sh @@ -5,7 +5,7 @@ set -o nounset -o pipefail -o errexit if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then - echo Usage: "$0 [test]" + echo Usage: "$0 [test]" exit 2 fi @@ -20,7 +20,7 @@ config_dir=`dirname "$0"` source ${config_dir}/master-config.shinc ## supply a default value for the build_flag string -build_flag=${4:-""} +build_flag=${3:-""} biolink_base_url_no_version=https://raw.githubusercontent.com/biolink/biolink-model/ # Issue #300: Need "v" before version number for URL to resolve @@ -44,9 +44,8 @@ else test_arg='' fi -umls_cuis_file=${1:-"${BUILD_DIR}/umls_cuis.tsv"} -output_nodes_file=${2:-"${BUILD_DIR}/kg2-ont-nodes${test_suffix}.json"} -output_edges_file=${3:-"${BUILD_DIR}/kg2-ont-edges${test_suffix}.json"} +output_nodes_file=${1:-"${BUILD_DIR}/kg2-ont-nodes${test_suffix}.json"} +output_edges_file=${2:-"${BUILD_DIR}/kg2-ont-edges${test_suffix}.json"} ## set the path to include ${BUILD_DIR} export PATH=$PATH:${BUILD_DIR} @@ -56,16 +55,6 @@ mem_gb=`${CODE_DIR}/get-system-memory-gb.sh` export OWLTOOLS_MEMORY=${mem_gb}G export DEBUG=1 ## for owltools -node_datatype_properties_file="${BUILD_DIR}/node_datatype_properties.json" - -## temporary work around for ontobio issue (see biolink issue #507) -${BUILD_DIR}/robot convert --input ${BUILD_DIR}/umls-hgnc.ttl --output ${BUILD_DIR}/umls-hgnc.owl -${BUILD_DIR}/robot convert --input ${BUILD_DIR}/umls-omim.ttl --output ${BUILD_DIR}/umls-omim.owl -${python_command} ${CODE_DIR}/save_owl_datatypeproperties.py \ - ${BUILD_DIR}/umls-hgnc.owl \ - ${BUILD_DIR}/umls-omim.owl \ - --outputFile ${node_datatype_properties_file} - 
${s3_cp_cmd} s3://${s3_bucket}/foodon.pickle ${BUILD_DIR}/ ## run the multi_ont_to_json_kg.py script @@ -75,9 +64,7 @@ cd ${BUILD_DIR} && ${python_command} ${CODE_DIR}/multi_ont_to_kg_jsonl.py \ ${curies_to_urls_file} \ ${ont_load_inventory_file} \ ${output_nodes_file} \ - ${output_edges_file} \ - ${umls_cuis_file} \ - ${node_datatype_properties_file} \ + ${output_edges_file} date echo "================= finished build-multi-ont-kg.sh =================" diff --git a/extract-umls.sh b/extract-umls.sh index c4acb17d..1b91cb2c 100755 --- a/extract-umls.sh +++ b/extract-umls.sh @@ -5,11 +5,11 @@ set -o nounset -o pipefail -o errexit if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then - echo Usage: "$0 [output_dir] [umls_cui_file]" + echo Usage: "$0 [umls_cui_file]" exit 2 fi -# Usage: extract-umls.sh [OUTPUT_DIR] [UMLS_CUI_FILE] +# Usage: extract-umls.sh [UMLS_CUI_FILE] echo "================= starting extract-umls.sh =================" date @@ -17,14 +17,10 @@ date config_dir=`dirname "$0"` source ${config_dir}/master-config.shinc -output_dir=${1:-${BUILD_DIR}} umls_cui_file=${2:-${BUILD_DIR}/umls_cuis.tsv} umls_ver=2023AA umls_file_base=umls-${umls_ver}-metathesaurus-full -umls2rdf_release=rtx-2.2 # This is the version of umls2rdf NOT RTX-KG2; do not change to update RTX-KG2 version -umls2rdf_pkgname=umls2rdf-${umls2rdf_release} -umls2rdf_dir=${umls_dir}/${umls2rdf_pkgname} config_file=${umls_dir}/config.prop mysql_dbname=umls @@ -79,41 +75,7 @@ sed -i "s/@LINE_TERMINATION@/'\n'/g" ${umls_dest_dir}/mysql_tables.sql cd ${umls_dest_dir} bash -x populate_mysql_db_configured.sh -## download and unpack the umls2rdf software -${curl_get} https://github.com/RTXteam/umls2rdf/archive/${umls2rdf_release}.tar.gz > ${umls2rdf_pkgname}.tar.gz -tar xzf ${umls2rdf_pkgname}.tar.gz -C ${umls_dir} - -## make the umls2rdf config file -cat ${umls2rdf_dir}/conf_sample.py | sed 's/your-host/localhost/g' | \ - sed "s/umls2015ab/${mysql_dbname}/g" | \ - sed "s/your db 
user/${mysql_user}/g" | \ - sed "s/your db pass/${mysql_password}/g" | \ - sed "s|output|${output_dir}|g" | \ - sed "s/2015ab/${umls_ver}/g" > ${umls2rdf_dir}/conf.py - -cp ${umls2rdf_config_master} ${umls2rdf_dir}/umls.conf - -## change to the umls2rdf_dir directory -cd ${umls2rdf_dir} - -## run umls2rdf -${VENV_DIR}/bin/python3 umls2rdf.py - -## verify the output files -./checkOutputSyntax.sh ${output_dir} # uses "rapper" command from the "raptor" package - -umls_cuis_query="SELECT DISTINCT s.CUI, GROUP_CONCAT(DISTINCT s.TUI), GROUP_CONCAT(DISTINCT c.STR) -FROM MRSTY s -INNER JOIN MRCONSO c -ON s.CUI=c.CUI -WHERE c.LAT='ENG' -AND c.TS='P' -AND STT='PF' -AND ISPREF='Y' -GROUP BY s.CUI" - -mysql --defaults-extra-file=${mysql_conf} --database=${mysql_dbname} \ - -e "${umls_cuis_query}" > ${umls_cui_file} +${python_command} ${CODE_DIR}/umls_mysql_to_list_jsonl.py ${mysql_conf} ${mysql_dbname} ${output_file} date echo "================= finished extract-umls.sh =================" diff --git a/multi_ont_to_kg_jsonl.py b/multi_ont_to_kg_jsonl.py index 530a4842..a0ff7568 100755 --- a/multi_ont_to_kg_jsonl.py +++ b/multi_ont_to_kg_jsonl.py @@ -1,12 +1,8 @@ #!/usr/bin/env python3 '''Builds the RTX "KG2" second-generation knowledge graph, from various OWL input files. 
- Usage: multi_ont_to_json_kg.py + Usage: multi_ont_to_kg_jsonl.py - - (note: outputFile can end in .json or in .gz; if the latter, it will be written as a gzipped file; - but using the gzip options for input or output seems to significantly increase transient memory - usage) ''' __author__ = 'Stephen Ramsey' @@ -28,7 +24,6 @@ import urllib.parse import urllib.request from typing import Dict -import json # temporary addition for Ontobio Issue #507 import datetime # -------------- define globals here --------------- @@ -130,7 +125,6 @@ def make_kg2(curies_to_categories: dict, nodes_output, edges_output, umls_cui_tsv_file: str, - node_datatype_properties_file: str, # temporary addition for Ontobio Issue #507 test_mode: bool = False, save_pickle: bool = False): @@ -159,32 +153,10 @@ def make_kg2(curies_to_categories: dict, kg2_util.log_message('Calling make_nodes_dict_from_ontologies_list') - # Temporary addition for addressing Ontobio Issue #507 - select_datatype_properties = dict() - with open(node_datatype_properties_file, 'r') as node_properties: - select_datatype_properties = json.load(node_properties) - - cui_lookup = dict() - with open(umls_cui_tsv_file, 'r') as cuis: - count = 0 - for line in cuis: - count += 1 - if count == 1: - continue - line = line.split('\t') - cui = line[0] - tuis = line[1].split(',') - name = line[2].strip() - if cui in cui_lookup: - kg2_util.log_message('CUI', cui, 'in TSV file multiple times') - cui_lookup[cui] = {'TUIs': tuis, 'Name': name} - nodes_dict = make_nodes_dict_from_ontologies_list(ont_file_information_dict_list, curies_to_categories, uri_to_curie_shortener, - curie_to_uri_expander, - cui_lookup, - select_datatype_properties) # temporary addition for Ontobio Issue #507 + curie_to_uri_expander) kg2_util.log_message('Calling make_map_of_node_ontology_ids_to_curie_ids') @@ -513,9 +485,7 @@ def get_category_for_multiple_tui(biolink_category_tree: dict, def make_nodes_dict_from_ontologies_list(ontology_info_list: list, 
curies_to_categories: dict, uri_to_curie_shortener: callable, - curie_to_uri_expander: callable, - cui_lookup: dict, - select_datatype_properties: dict) -> Dict[str, dict]: # temporary addition for Ontobio Issue #507 + curie_to_uri_expander: callable) -> Dict[str, dict]: ret_dict = dict() omim_to_hgnc_symbol = dict() ontologies_iris_to_curies = dict() @@ -536,13 +506,6 @@ def make_nodes_dict_from_ontologies_list(ontology_info_list: list, convert_bpv_pred_to_curie_func = make_convert_bpv_predicate_to_curie(uri_to_curie_shortener, curie_to_uri_expander) - for cui in cui_lookup: - tuis = cui_lookup[cui]['TUIs'] - category = get_category_for_multiple_tui(biolink_category_tree, - tuis, - mappings_to_categories) - cui_lookup[cui]['Category'] = category - def biolink_depth_getter(category: str): return biolink_categories_ontology_depths.get(category, None) @@ -762,8 +725,6 @@ def biolink_depth_getter(category: str): elif bpv_pred_curie == kg2_util.CURIE_ID_HGNC_GENE_SYMBOL: node_gene_symbol = bpv_val node_synonyms.add(node_gene_symbol) - elif bpv_pred_curie == kg2_util.CURIE_ID_UMLS_HAS_CUI: - node_has_cui = True if len(node_tui_list) == 1: node_tui = node_tui_list[0] node_tui_curie = kg2_util.CURIE_PREFIX_UMLS_STY + ':' + node_tui @@ -887,40 +848,6 @@ def biolink_depth_getter(category: str): if node_gene_symbol is not None: node_name = node_gene_symbol - # Temporary code to address Ontobio Issue #507 - if ontology_info_dict['file'] in select_datatype_properties: - filename = ontology_info_dict['file'] - if filename == 'umls-omim.ttl': - mimtype = select_datatype_properties[filename].get(node_curie_id, {}).get('MIMTYPE', None) - if mimtype is not None: - # 0, 3, 5 are phenotypes - # 1, 4 are genes - # There isn't a 2 anymore - if mimtype == "1" or mimtype == "4": - node_category_label = kg2_util.BIOLINK_CATEGORY_GENE - gene_symbol = omim_to_hgnc_symbol.get(node_curie_id, None) - if gene_symbol is not None: - old_name = node_name - node_name = gene_symbol - else: - 
node_name += " related phenotypic feature" - else: - node_category_label = kg2_util.BIOLINK_CATEGORY_NAMED_THING - if filename == 'umls-hgnc.ttl': - hgnc_properties = select_datatype_properties[filename].get(node_curie_id, {}) - omim_id = hgnc_properties.get('OMIM_ID', None) - gene_symbol = hgnc_properties.get('GENESYMBOL', None) - if omim_id is not None: - if isinstance(omim_id, list): - for id in omim_id: - omim_to_hgnc_symbol[kg2_util.CURIE_PREFIX_OMIM + ':' + id] = gene_symbol - else: - omim_to_hgnc_symbol[kg2_util.CURIE_PREFIX_OMIM + ':' + omim_id] = gene_symbol - locus_group = hgnc_properties.get('LOCUS_GROUP', None) - if locus_group is not None: - if locus_group == "phenotype": - continue - node_dict = kg2_util.make_node(node_curie_id, iri, node_name, @@ -939,53 +866,6 @@ def biolink_depth_getter(category: str): node_dict['synonym'] = sorted(list(node_synonyms)) # slot name is not biolink standard node_dict['publications'] = sorted(list(node_publications)) - # check if we need to make a CUI node - if node_meta is not None and basic_property_values is not None: - for basic_property_value_dict in basic_property_values: - bpv_pred = basic_property_value_dict['pred'] - bpv_pred_curie = convert_bpv_pred_to_curie_func(bpv_pred) - bpv_val = basic_property_value_dict['val'] - if bpv_pred_curie == kg2_util.CURIE_ID_UMLS_HAS_CUI: - cui_node_dict = dict(node_dict) - cui_uri = kg2_util.BASE_URL_UMLS + bpv_val - cui_curie = uri_to_curie_shortener(cui_uri) - assert cui_curie is not None - # Skip this CUI if it's identical to the ontology node itself (happens with files created - # using 'load_on_cuis' - part of fix for issue #565) - if get_local_id_from_curie_id(cui_curie) == get_local_id_from_curie_id(node_curie_id): - continue - cui_node_dict['id'] = cui_curie - cui = cui_curie.split(':')[1] - cui_node_dict['iri'] = cui_uri - cui_node_dict['synonym'] = [] - cui_node_dict['category'] = kg2_util.convert_biolink_category_to_curie(cui_lookup[cui]['Category']) - 
cui_node_dict['category_label'] = cui_lookup[cui]['Category'].replace(' ', '_') - cui_name = cui_lookup[cui]['Name'] - if cui_name.isupper(): - cui_name = kg2_util.allcaps_to_only_first_letter_capitalized(cui_name) - cui_node_dict['name'] = cui_name - cui_node_dict['ontology node ids'] = [] - cui_node_dict['provided_by'] = kg2_util.CURIE_ID_UMLS_SOURCE_CUI - cui_node_dict['xrefs'] = [] # blanking the "xrefs" here is *vital* in order to avoid issue #395 - cui_node_dict_existing = ret_dict.get(cui_curie, None) - if cui_node_dict_existing is not None: - cui_node_dict = kg2_util.merge_two_dicts(cui_node_dict, - cui_node_dict_existing, - biolink_depth_getter) - ret_dict[cui_curie] = cui_node_dict - node_dict_xrefs = node_dict['xrefs'] - node_dict_xrefs.append(cui_curie) - node_dict['xrefs'] = sorted(list(set(node_dict_xrefs))) - elif bpv_pred_curie == kg2_util.CURIE_ID_HGNC_ENTREZ_GENE_ID: - entrez_gene_id = bpv_val - entrez_node_dict = dict(node_dict) - entrez_curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + entrez_gene_id - entrez_node_dict['id'] = entrez_curie - entrez_node_dict['iri'] = curie_to_uri_expander(entrez_curie) - ret_dict[entrez_curie] = entrez_node_dict - node_dict_xrefs = node_dict['xrefs'] - node_dict_xrefs.append(entrez_curie) - node_dict['xrefs'] = sorted(list(set(node_dict_xrefs))) if node_curie_id in ret_dict: if node_curie_id != provided_by: node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id], @@ -1358,8 +1238,6 @@ def make_arg_parser(): arg_parser.add_argument('ontLoadInventoryFile', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) - arg_parser.add_argument('umlsCUITSVFile', type=str) - arg_parser.add_argument('nodeDatatypePropertiesFile', type=str) # temporary addition for Ontobio Issue #507 return arg_parser @@ -1374,8 +1252,6 @@ def make_arg_parser(): ont_load_inventory_file = args.ontLoadInventoryFile output_nodes_file_name = args.outputNodesFile output_edges_file_name 
= args.outputEdgesFile - umls_cui_tsv_file = args.umlsCUITSVFile - node_datatype_properties_file = args.nodeDatatypePropertiesFile # temporary addition for Ontobio Issue #507 save_pickle = args.save_pickle test_mode = args.test curies_to_categories = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) @@ -1394,8 +1270,6 @@ def make_arg_parser(): ont_urls_and_files, nodes_output, edges_output, - umls_cui_tsv_file, - node_datatype_properties_file, # temporary addition for Ontobio Issue #507 test_mode, save_pickle) diff --git a/ont-load-inventory.yaml b/ont-load-inventory.yaml index 7910c922..aa98ff61 100644 --- a/ont-load-inventory.yaml +++ b/ont-load-inventory.yaml @@ -3,180 +3,6 @@ file: biolink-model.owl.ttl download: true title: Biolink meta-model -- # maps to CURIE prefix: UMLSSC - url: http://purl.bioontology.org/ontology/STY/ - file: umls-semantictypes.ttl - download: false - title: UMLS Semantic Types -- # maps to CURIE prefix: ATC - download: false - file: umls-atc.ttl - title: Anatomical Therapeutic Chemical Classification System - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ATC -- # maps to CURIE prefix CHV - download: false - file: umls-chv.ttl - title: Consumer Health Vocabulary - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CHV -# - # maps to CURIE prefix CPT -# download: false -# file: umls-cpt.ttl -# title: Current Procedural Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CPT -- # maps to CURIE prefix DRUGBANK - download: false - file: umls-drugbank.ttl - title: DrugBank - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/DRUGBANK -- # maps to CURIE prefix FMA - download: false - file: umls-fma.ttl - title: Foundational Model of Anatomy - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/FMA -- # maps to CURIE prefix GO - download: false - file: umls-go.ttl - title: Gene Ontology - 
url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/GO -- # maps to CURIE prefix HCPCS - download: false - file: umls-hcpcs.ttl - title: Healthcare Common Procedure Coding System - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPCS -# - # maps to CURIE prefix CPT -# download: false -# file: umls-hcpt.ttl -# title: CPT in HCPCS -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPT -- # maps to CURIE prefix HGNC - download: false - file: umls-hgnc.ttl - title: HUGO Gene Nomenclature Committee - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HGNC -- # maps to CURIE prefix umls - download: false - file: umls-hl7.ttl - title: HL7 Version 3.0 - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HL7 -- # maps to CURIE prefix HP - download: false - file: umls-hpo.ttl - title: Human Phenotype Ontology - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HPO -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10.ttl -# title: International Classification of Diseases and Related Health Problems, -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10 -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10ae.ttl -# title: ICD-10, American English Equivalents -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10AE -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10cm.ttl -# title: International Classification of Diseases, Tenth Revision, Clinical Modification -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10CM -- # maps to CURIE prefix ICD10PCS - download: false - file: umls-icd10pcs.ttl - title: ICD-10 Procedure Coding System - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10PCS -- # maps to CURIE prefix ICD9 - download: false - file: umls-icd9cm.ttl - title: International Classification of Diseases, 
Ninth Revision, Clinical Modification - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD9CM -# - # maps to CURIE prefix LOINC -# download: false -# file: umls-lnc.ttl -# title: Logical Observation Identifiers Names and Codes -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/LNC -# - # maps to CURIE prefix MEDDRA -# download: false -# file: umls-mdr.ttl -# title: MedDRA -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDDRA -- # maps to CURIE prefix umls - download: false - file: umls-med-rt.ttl - title: Medication Reference Terminology - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MED-RT -- # maps to CURIE prefix umls - download: false - file: umls-medlineplus.ttl - title: MedlinePlus Health Topics - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDLINEPLUS -- # maps to CURIE prefix MESH - download: false - file: umls-msh.ttl - title: Medical Subject Headings - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MSH -- # maps to CURIE prefix NCBITaxon - download: false - file: umls-ncbi.ttl - title: NCBI - url: http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl -- # maps to CURIE prefix NCIT - download: false - file: umls-nci.ttl - title: NCI Thesaurus - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NCI -- # maps to CURIE prefix NDDF - download: false - file: umls-nddf.ttl - title: National Drug Data File - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDDF -- # maps to CURIE prefix NDFRT - download: false - file: umls-ndfrt.ttl - title: National Drug File - Reference Terminology - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDFRT -- # maps to CURIE prefix OMIM - download: false - file: umls-omim.ttl - title: Online Mendelian Inheritance in Man - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/OMIM -- # maps to CURIE prefix PDQ - 
download: false - file: umls-pdq.ttl - title: Physician Data Query - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/PDQ -- # maps to CURIE prefix PSY - download: false - file: umls-psy.ttl - title: Psychological Index Terms - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/PSY -- # maps to CURIE prefix RXNORM - download: false - file: umls-rxnorm.ttl - title: RXNORM - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/RXNORM -# - # maps to CURIE prefix SNOMED -# download: false -# file: umls-snomedct_us.ttl -# title: SNOMED Clinical Terms US Edition -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT -# ==> unable to find an online set of pages for SNOMEDCT_VET concepts but I want to find one so that -# I can include SNOMEDCT_VET in the kg2 build, thus am keeping this section commented out [SAR]: -# - -# download: false -# file: umls-snomedct_vet.ttl -# title: Veterinary Extension to SNOMED CT -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT_VET -# ==> this section (UMLS Source Terminology Names) seems like it could be useful in the future, but -# I can't find purls to its concepts anywhere: -# - -# download: false -# file: umls-src.ttl -# title: Source Terminology Names (UMLS) -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SRC -- # maps to CURIE prefix VANDF - download: false - file: umls-vandf.ttl - title: National Drug File - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF - # maps to CURIE prefix: BFO url: http://purl.obolibrary.org/obo/bfo.owl file: bfo.owl @@ -292,8 +118,3 @@ file: mi.owl download: true title: Molecular Interactions Controlled Vocabulary -- # maps to CURIE prefix umls - download: false - file: umls-mth.ttl - title: Metathesaurus Names - url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MTH diff --git a/snakemake-config-var.yaml 
b/snakemake-config-var.yaml index 2e2c923c..10335509 100644 --- a/snakemake-config-var.yaml +++ b/snakemake-config-var.yaml @@ -3,12 +3,19 @@ edges_suffix: -edges validation_placeholder: ${BUILD_DIR}/validation-placeholder.empty -ont_extraction_base: extract-umls +umls_extraction_base: extract-umls +umls_conversion_base: umls_list_jsonl_to_kg_jsonl +umls_output_base: kg2-umls +umls_extraction_script: ${CODE_DIR}/${umls_extraction_base}.sh +umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${test_suffix}.log +umls_extract_file: ${BUILD_DIR}/umls.jsonl +umls_conversion_script: ${CODE_DIR}/${umls_conversion_base}.py +umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${test_suffix}.log +umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl +umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_suffix}.jsonl + ont_conversion_base: build-multi-ont-kg ont_output_base: kg2-ont -ont_extraction_script: ${CODE_DIR}/${ont_extraction_base}.sh -ont_extraction_log: ${BUILD_DIR}/${ont_extraction_base}${test_suffix}.log -umls_cui_file: ${BUILD_DIR}/umls_cuis.tsv ont_conversion_script: ${CODE_DIR}/${ont_conversion_base}.sh ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${test_suffix}.log ont_output_nodes_file: ${BUILD_DIR}/${ont_output_base}${nodes_suffix}${test_suffix}.jsonl From 9d6cb464a7dd967281372c8e2d254dfea2ea764d Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 7 Sep 2023 19:00:02 -0700 Subject: [PATCH 107/117] #316 it no longer makes sense to validate ont load inventory ttl files --- run-validation-tests.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/run-validation-tests.sh b/run-validation-tests.sh index 50d1bb02..f8d3a468 100755 --- a/run-validation-tests.sh +++ b/run-validation-tests.sh @@ -69,13 +69,6 @@ ${python_command} -u ${CODE_DIR}/validate_predicate_remap_yaml.py \ ${biolink_model_yaml_url} \ ${biolink_model_yaml_local_file} -${python_command} -u 
${CODE_DIR}/validate_ont_load_inventory.py \ - ${ont_load_inventory_file} \ - ${curies_to_urls_file} \ - ${umls2rdf_config_master} \ - ${biolink_model_owl_url} \ - ${biolink_model_owl_local_file} - ${python_command} -u ${CODE_DIR}/validate_provided_by_to_infores_map_yaml.py \ ${infores_mapping_file} \ ${infores_catalog_yaml} From 1907352c48d15263fd131fa057aa03acacb1d1c0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 10:50:04 -0700 Subject: [PATCH 108/117] #316 remove log file for testing to figure out what's going on --- build-kg2-snakemake.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build-kg2-snakemake.sh b/build-kg2-snakemake.sh index 28d06b28..b0a1a5d3 100755 --- a/build-kg2-snakemake.sh +++ b/build-kg2-snakemake.sh @@ -71,7 +71,7 @@ then trap "cat ${build_kg2_log_file}" EXIT fi -{ +# { echo "================= starting build-kg2-snakemake.sh ==================" date @@ -118,7 +118,7 @@ cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish date echo "================ script finished ============================" -} > ${build_kg2_log_file} 2>&1 +# } > ${build_kg2_log_file} 2>&1 if [[ "${ci_flag}" != "ci" && "${dryrun}" != "-n" ]] then From 1e3c97d9c23ed29386fef52ba56644e0ef8dc4ca Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 11:03:56 -0700 Subject: [PATCH 109/117] #316 SemMed UMLS input --- Snakefile-conversion | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile-conversion b/Snakefile-conversion index 00167044..ba15511d 100644 --- a/Snakefile-conversion +++ b/Snakefile-conversion @@ -9,7 +9,7 @@ rule UMLS_Conversion: log: config['UMLS_CONVERSION_LOG'] shell: - "bash -x {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule Ontologies_Conversion: input: @@ -27,7 +27,7 @@ rule 
SemMedDB_Conversion: input: code = config['SEMMEDDB_CONVERSION_SCRIPT'], real = config['SEMMEDDB_TUPLELIST_FILE'], - mrcui_req = config['UMLS_CUI_FILE'], + mrcui_req = config['UMLS_EXTRACT_FILE'], exclusion_list = config['SEMMEDDB_EXCLUSION_FILE'], version_file = config['SEMMEDDB_VERSION_FILE'], validation = config['VALIDATION_PLACEHOLDER'] From ffcb318eb623408afd85260f2d4e8bb31a4d96b8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 11:15:36 -0700 Subject: [PATCH 110/117] #316 add UMLS into the merge --- Snakefile-post-etl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Snakefile-post-etl b/Snakefile-post-etl index e47913b3..05fd073b 100644 --- a/Snakefile-post-etl +++ b/Snakefile-post-etl @@ -1,6 +1,8 @@ rule Merge: input: code = config['MERGE_SCRIPT'], + umls_nodes = config['UMLS_OUTPUT_NODES_FILE'], + umls_edges = config['UMLS_OUTPUT_EDGES_FILE'], ont_nodes = config['ONT_OUTPUT_NODES_FILE'], ont_edges = config['ONT_OUTPUT_EDGES_FILE'], uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'], @@ -53,6 +55,7 @@ rule Merge: " --outputNodesFile {output.nodes} " + \ " --outputEdgesFile {output.edges} " + \ " --kgNodesFiles " + \ + "{input.umls_nodes} " + \ "{input.ont_nodes} " + \ "{input.semmeddb_nodes} " + \ "{input.uniprot_nodes} " + \ @@ -74,6 +77,7 @@ rule Merge: "{input.disgenet_nodes} " + \ "{input.kegg_nodes} " + \ " --kgEdgesFiles " + \ + "{input.umls_edges} " + \ "{input.ont_edges} " + \ "{input.semmeddb_edges} " + \ "{input.uniprot_edges} " + \ From e9eace1a06b563b7a74777d85c6769834b785f6d Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 11:28:56 -0700 Subject: [PATCH 111/117] #316 parameterize UMLS conversion --- Snakefile-conversion | 5 ++++- snakemake-config-var.yaml | 2 ++ umls_list_jsonl_to_kg_jsonl.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Snakefile-conversion b/Snakefile-conversion index ba15511d..0d33b54f 100644 --- a/Snakefile-conversion +++ b/Snakefile-conversion @@ -2,6 +2,9 @@ 
rule UMLS_Conversion: input: code = config['UMLS_CONVERSION_SCRIPT'], real = config['UMLS_EXTRACT_FILE'], + curies_to_urls_map = config['CURIES_TO_URLS_FILE'], + umls_name_heirarchy = config['UMLS_NAME_HEIRARCHY'], + tui_map = config['UMLS_TUI_MAP'], validation = config['VALIDATION_PLACEHOLDER'] output: nodes = config['UMLS_OUTPUT_NODES_FILE'], @@ -9,7 +12,7 @@ rule UMLS_Conversion: log: config['UMLS_CONVERSION_LOG'] shell: - config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_urls_map} {input.umls_name_heirarchy} {input.tui_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule Ontologies_Conversion: input: diff --git a/snakemake-config-var.yaml b/snakemake-config-var.yaml index 10335509..40569c1b 100644 --- a/snakemake-config-var.yaml +++ b/snakemake-config-var.yaml @@ -11,6 +11,8 @@ umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl umls_conversion_script: ${CODE_DIR}/${umls_conversion_base}.py umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${test_suffix}.log +umls_name_heirarchy: ${CODE_DIR}/umls-name-heirarchy.yaml +umls_tui_map: ${CODE_DIR}/tui_combo_mappings.json umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_suffix}.jsonl diff --git a/umls_list_jsonl_to_kg_jsonl.py b/umls_list_jsonl_to_kg_jsonl.py index 3ff28081..0d07b788 100644 --- a/umls_list_jsonl_to_kg_jsonl.py +++ b/umls_list_jsonl_to_kg_jsonl.py @@ -26,6 +26,9 @@ def get_args(): arg_parser = argparse.ArgumentParser(description='umls_list_jsonl_to_kg_jsonl.py: converts UMLS MySQL JSON Lines dump into KG2 JSON format') arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('curiesToURIFile', type=str) + 
arg_parser.add_argument('umlsNameHeirarchy', type=str) + arg_parser.add_argument('TUIComboMappings', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) arg_parser.add_argument('--test', dest='test', action="store_true", default=False) @@ -43,6 +46,9 @@ def extract_node_id(node_id_str): args = get_args() input_file_name = args.inputFile test_mode = args.test + curies_to_urls_map_file_name = args.curiesToURIFile + umls_name_heirarchy_file_name = args.umlsNameHeirarchy + tui_combo_mappings_file_name = args.TUIComboMappings output_nodes_file_name = args.outputNodesFile output_edges_file_name = args.outputEdgesFile @@ -53,11 +59,11 @@ def extract_node_id(node_id_str): input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_items = input_read_jsonlines_info[0] - with open('tui_combo_mappings.json') as mappings: + with open(tui_combo_mappings_file_name) as mappings: TUI_MAPPINGS = json.load(mappings) - iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('curies-to-urls-map.yaml'))['use_for_bidirectional_mapping'] - full_heirarchy = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string('umls-name-heirarchy.yaml')) + iri_mappings_raw = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name))['use_for_bidirectional_mapping'] + full_heirarchy = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(umls_name_heirarchy_file_name)) for item in iri_mappings_raw: for prefix in item: IRI_MAPPINGS[prefix] = item[prefix] From 75498b5a97dffcd6f097e0b14b55cadb1ee59ee7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 12:41:53 -0700 Subject: [PATCH 112/117] #316 comment the log again --- build-kg2-snakemake.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build-kg2-snakemake.sh b/build-kg2-snakemake.sh index b0a1a5d3..28d06b28 100755 --- a/build-kg2-snakemake.sh +++ 
b/build-kg2-snakemake.sh @@ -71,7 +71,7 @@ then trap "cat ${build_kg2_log_file}" EXIT fi -# { +{ echo "================= starting build-kg2-snakemake.sh ==================" date @@ -118,7 +118,7 @@ cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish date echo "================ script finished ============================" -# } > ${build_kg2_log_file} 2>&1 +} > ${build_kg2_log_file} 2>&1 if [[ "${ci_flag}" != "ci" && "${dryrun}" != "-n" ]] then From ef9d55ca238d77586dd4678a2b5a61185e1fc7b9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 13:12:35 -0700 Subject: [PATCH 113/117] #349 Uberon link was wrong --- ont-load-inventory.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ont-load-inventory.yaml b/ont-load-inventory.yaml index aa98ff61..b6728bcf 100644 --- a/ont-load-inventory.yaml +++ b/ont-load-inventory.yaml @@ -24,7 +24,7 @@ download: true title: Relation Ontology - - url: http://purl.obolibrary.org/obo/uberon/ext.owl + url: http://purl.obolibrary.org/obo/uberon.owl file: uberon-ext.owl download: true title: Uber-anatomy Ontology From 86cb186b4425b74a759dafa5d06b6e93d5482c26 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 8 Sep 2023 21:05:23 -0700 Subject: [PATCH 114/117] #316 #349 addressing unbound variable --- extract-umls.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract-umls.sh b/extract-umls.sh index 1b91cb2c..9fbd63d6 100755 --- a/extract-umls.sh +++ b/extract-umls.sh @@ -17,7 +17,7 @@ date config_dir=`dirname "$0"` source ${config_dir}/master-config.shinc -umls_cui_file=${2:-${BUILD_DIR}/umls_cuis.tsv} +output_file=${2:-${BUILD_DIR}/umls.jsonl} umls_ver=2023AA umls_file_base=umls-${umls_ver}-metathesaurus-full From 10ec96180ade46f6b2c6261894394e3866d0ae92 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 9 Sep 2023 13:18:07 -0700 Subject: [PATCH 115/117] #316 editing uberon to match new one --- kg2-provided-by-curie-to-infores-curie.yaml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/kg2-provided-by-curie-to-infores-curie.yaml b/kg2-provided-by-curie-to-infores-curie.yaml index eeda5413..4a907523 100644 --- a/kg2-provided-by-curie-to-infores-curie.yaml +++ b/kg2-provided-by-curie-to-infores-curie.yaml @@ -110,7 +110,7 @@ OBO:ro.owl: source_name: Relations Ontology infores_curie: infores:ro knowledge_type: knowledge_source -OBO:uberon/ext.owl: +OBO:uberon: source_name: Uber Anatomy Ontology infores_curie: infores:uberon knowledge_type: knowledge_source From f792f9169057403c25138d6402444e7c14879b25 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 9 Sep 2023 13:31:40 -0700 Subject: [PATCH 116/117] #316 try number 2 to fix uberon issue --- kg2-provided-by-curie-to-infores-curie.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kg2-provided-by-curie-to-infores-curie.yaml b/kg2-provided-by-curie-to-infores-curie.yaml index 4a907523..587ccd9f 100644 --- a/kg2-provided-by-curie-to-infores-curie.yaml +++ b/kg2-provided-by-curie-to-infores-curie.yaml @@ -110,7 +110,7 @@ OBO:ro.owl: source_name: Relations Ontology infores_curie: infores:ro knowledge_type: knowledge_source -OBO:uberon: +OBO:uberon.owl: source_name: Uber Anatomy Ontology infores_curie: infores:uberon knowledge_type: knowledge_source From 569cb093280ea3c2952dcce461f807ad54313397 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 9 Sep 2023 15:34:25 -0700 Subject: [PATCH 117/117] #316 dealing with new qualifiers --- predicate-remap.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/predicate-remap.yaml b/predicate-remap.yaml index f617a3bb..0a6030e7 100644 --- a/predicate-remap.yaml +++ b/predicate-remap.yaml @@ -1055,6 +1055,11 @@ DrugCentral:positive_modulator: qualifiers: object_aspect: activity object_direction: increased +DrugCentral:reduce_risk: + operation: keep + core_predicate: biolink:affects_risk_for + qualifiers: + object_direction: decreased DrugCentral:releasing_agent: operation: keep core_predicate: 
biolink:affects @@ -3131,6 +3136,12 @@ ORPHANET:C016: ORPHANET:C017: operation: keep core_predicate: biolink:related_to +ORPHANET:C056: + operation: keep + core_predicate: biolink:close_match +ORPHANET:C057: + operation: keep + core_predicate: biolink:close_match # PATO:0000085: # operation: keep # core_predicate: biolink:associated_with_sensitivity_to @@ -3782,6 +3793,9 @@ RO:0002411: RO:0002412: operation: keep core_predicate: biolink:precedes +RO:0002428: + operation: keep + core_predicate: biolink:regulates # RO:0002432: # operation: keep # core_predicate: biolink:active_in