Skip to content

Commit

Permalink
Merge pull request RTXteam#351 from RTXteam/issue-316
Browse files Browse the repository at this point in the history
UMLS Re-ETL
  • Loading branch information
ecwood authored Sep 13, 2023
2 parents 002a0f6 + 569cb09 commit d7dd8e3
Show file tree
Hide file tree
Showing 20 changed files with 7,161 additions and 578 deletions.
23 changes: 19 additions & 4 deletions Snakefile-conversion
Original file line number Diff line number Diff line change
@@ -1,21 +1,36 @@
rule Ontologies_and_TTL:
rule UMLS_Conversion:
input:
code = config['UMLS_CONVERSION_SCRIPT'],
real = config['UMLS_EXTRACT_FILE'],
curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
umls_name_heirarchy = config['UMLS_NAME_HEIRARCHY'],
tui_map = config['UMLS_TUI_MAP'],
validation = config['VALIDATION_PLACEHOLDER']
output:
nodes = config['UMLS_OUTPUT_NODES_FILE'],
edges = config['UMLS_OUTPUT_EDGES_FILE']
log:
config['UMLS_CONVERSION_LOG']
shell:
config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_urls_map} {input.umls_name_heirarchy} {input.tui_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"

rule Ontologies_Conversion:
input:
code = config['ONT_CONVERSION_SCRIPT'],
real = config['UMLS_CUI_FILE'],
validation = config['VALIDATION_PLACEHOLDER']
output:
nodes = config['ONT_OUTPUT_NODES_FILE'],
edges = config['ONT_OUTPUT_EDGES_FILE']
log:
config['ONT_CONVERSION_LOG']
shell:
"bash -x {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"
"bash -x {input.code} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1"

rule SemMedDB_Conversion:
input:
code = config['SEMMEDDB_CONVERSION_SCRIPT'],
real = config['SEMMEDDB_TUPLELIST_FILE'],
mrcui_req = config['UMLS_CUI_FILE'],
mrcui_req = config['UMLS_EXTRACT_FILE'],
exclusion_list = config['SEMMEDDB_EXCLUSION_FILE'],
version_file = config['SEMMEDDB_VERSION_FILE'],
validation = config['VALIDATION_PLACEHOLDER']
Expand Down
8 changes: 4 additions & 4 deletions Snakefile-extraction
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
rule UMLS:
input:
code = config['ONT_EXTRACTION_SCRIPT'],
code = config['UMLS_EXTRACTION_SCRIPT'],
validation = config['VALIDATION_PLACEHOLDER']
output:
config['UMLS_CUI_FILE']
config['UMLS_EXTRACT_FILE']
log:
config['ONT_EXTRACTION_LOG']
config['UMLS_EXTRACTION_LOG']
shell:
"bash -x {input.code} " + config['BUILD_DIR'] + " {output} > {log} 2>&1"
"bash -x {input.code} {output} > {log} 2>&1"

rule SemMedDB:
input:
Expand Down
4 changes: 4 additions & 0 deletions Snakefile-post-etl
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
rule Merge:
input:
code = config['MERGE_SCRIPT'],
umls_nodes = config['UMLS_OUTPUT_NODES_FILE'],
umls_edges = config['UMLS_OUTPUT_EDGES_FILE'],
ont_nodes = config['ONT_OUTPUT_NODES_FILE'],
ont_edges = config['ONT_OUTPUT_EDGES_FILE'],
uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'],
Expand Down Expand Up @@ -53,6 +55,7 @@ rule Merge:
" --outputNodesFile {output.nodes} " + \
" --outputEdgesFile {output.edges} " + \
" --kgNodesFiles " + \
"{input.umls_nodes} " + \
"{input.ont_nodes} " + \
"{input.semmeddb_nodes} " + \
"{input.uniprot_nodes} " + \
Expand All @@ -74,6 +77,7 @@ rule Merge:
"{input.disgenet_nodes} " + \
"{input.kegg_nodes} " + \
" --kgEdgesFiles " + \
"{input.umls_edges} " + \
"{input.ont_edges} " + \
"{input.semmeddb_edges} " + \
"{input.uniprot_edges} " + \
Expand Down
23 changes: 5 additions & 18 deletions build-multi-ont-kg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
set -o nounset -o pipefail -o errexit

if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
echo Usage: "$0 <input_file.tsv> <output_nodes_file.jsonl> <output_edges_file.jsonl> [test]"
echo Usage: "$0 <output_nodes_file.jsonl> <output_edges_file.jsonl> [test]"
exit 2
fi

Expand All @@ -20,7 +20,7 @@ config_dir=`dirname "$0"`
source ${config_dir}/master-config.shinc

## supply a default value for the build_flag string
build_flag=${4:-""}
build_flag=${3:-""}
biolink_base_url_no_version=https://raw.githubusercontent.com/biolink/biolink-model/

# Issue #300: Need "v" before version number for URL to resolve
Expand All @@ -44,9 +44,8 @@ else
test_arg=''
fi

umls_cuis_file=${1:-"${BUILD_DIR}/umls_cuis.tsv"}
output_nodes_file=${2:-"${BUILD_DIR}/kg2-ont-nodes${test_suffix}.json"}
output_edges_file=${3:-"${BUILD_DIR}/kg2-ont-edges${test_suffix}.json"}
output_nodes_file=${1:-"${BUILD_DIR}/kg2-ont-nodes${test_suffix}.json"}
output_edges_file=${2:-"${BUILD_DIR}/kg2-ont-edges${test_suffix}.json"}

## set the path to include ${BUILD_DIR}
export PATH=$PATH:${BUILD_DIR}
Expand All @@ -56,16 +55,6 @@ mem_gb=`${CODE_DIR}/get-system-memory-gb.sh`
export OWLTOOLS_MEMORY=${mem_gb}G
export DEBUG=1 ## for owltools

node_datatype_properties_file="${BUILD_DIR}/node_datatype_properties.json"

## temporary work around for ontobio issue (see biolink issue #507)
${BUILD_DIR}/robot convert --input ${BUILD_DIR}/umls-hgnc.ttl --output ${BUILD_DIR}/umls-hgnc.owl
${BUILD_DIR}/robot convert --input ${BUILD_DIR}/umls-omim.ttl --output ${BUILD_DIR}/umls-omim.owl
${python_command} ${CODE_DIR}/save_owl_datatypeproperties.py \
${BUILD_DIR}/umls-hgnc.owl \
${BUILD_DIR}/umls-omim.owl \
--outputFile ${node_datatype_properties_file}

${s3_cp_cmd} s3://${s3_bucket}/foodon.pickle ${BUILD_DIR}/

## run the multi_ont_to_kg_jsonl.py script
Expand All @@ -75,9 +64,7 @@ cd ${BUILD_DIR} && ${python_command} ${CODE_DIR}/multi_ont_to_kg_jsonl.py \
${curies_to_urls_file} \
${ont_load_inventory_file} \
${output_nodes_file} \
${output_edges_file} \
${umls_cuis_file} \
${node_datatype_properties_file} \
${output_edges_file}

date
echo "================= finished build-multi-ont-kg.sh ================="
2 changes: 2 additions & 0 deletions curies-to-urls-map.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ use_for_bidirectional_mapping:
CHEMBL.TARGET: "https://identifiers.org/chembl.target:"
-
CHMO: http://purl.obolibrary.org/obo/CHMO_
-
CHV: http://purl.bioontology.org/ontology/CHV/
-
CID: 'http://pubchem.ncbi.nlm.nih.gov/compound/'
-
Expand Down
46 changes: 4 additions & 42 deletions extract-umls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,22 @@
set -o nounset -o pipefail -o errexit

if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
echo Usage: "$0 [output_dir] [umls_cui_file]"
echo Usage: "$0 [umls_cui_file]"
exit 2
fi

# Usage: extract-umls.sh [OUTPUT_DIR] [UMLS_CUI_FILE]
# Usage: extract-umls.sh [UMLS_CUI_FILE]

echo "================= starting extract-umls.sh ================="
date

config_dir=`dirname "$0"`
source ${config_dir}/master-config.shinc

output_dir=${1:-${BUILD_DIR}}
umls_cui_file=${2:-${BUILD_DIR}/umls_cuis.tsv}
# Destination for the UMLS extract. The script accepts a single optional
# positional argument (see the usage text: "$0 [umls_cui_file]"), so the
# output path must be read from $1; reading $2 would silently ignore the
# path supplied by the caller and always fall back to the default.
output_file=${1:-${BUILD_DIR}/umls.jsonl}

umls_ver=2023AA
umls_file_base=umls-${umls_ver}-metathesaurus-full
umls2rdf_release=rtx-2.2 # This is the version of umls2rdf NOT RTX-KG2; do not change to update RTX-KG2 version
umls2rdf_pkgname=umls2rdf-${umls2rdf_release}
umls2rdf_dir=${umls_dir}/${umls2rdf_pkgname}
config_file=${umls_dir}/config.prop
mysql_dbname=umls

Expand Down Expand Up @@ -79,41 +75,7 @@ sed -i "s/@LINE_TERMINATION@/'\n'/g" ${umls_dest_dir}/mysql_tables.sql
cd ${umls_dest_dir}
bash -x populate_mysql_db_configured.sh

## download and unpack the umls2rdf software
${curl_get} https://github.com/RTXteam/umls2rdf/archive/${umls2rdf_release}.tar.gz > ${umls2rdf_pkgname}.tar.gz
tar xzf ${umls2rdf_pkgname}.tar.gz -C ${umls_dir}

## make the umls2rdf config file
cat ${umls2rdf_dir}/conf_sample.py | sed 's/your-host/localhost/g' | \
sed "s/umls2015ab/${mysql_dbname}/g" | \
sed "s/your db user/${mysql_user}/g" | \
sed "s/your db pass/${mysql_password}/g" | \
sed "s|output|${output_dir}|g" | \
sed "s/2015ab/${umls_ver}/g" > ${umls2rdf_dir}/conf.py

cp ${umls2rdf_config_master} ${umls2rdf_dir}/umls.conf

## change to the umls2rdf_dir directory
cd ${umls2rdf_dir}

## run umls2rdf
${VENV_DIR}/bin/python3 umls2rdf.py

## verify the output files
./checkOutputSyntax.sh ${output_dir} # uses "rapper" command from the "raptor" package

umls_cuis_query="SELECT DISTINCT s.CUI, GROUP_CONCAT(DISTINCT s.TUI), GROUP_CONCAT(DISTINCT c.STR)
FROM MRSTY s
INNER JOIN MRCONSO c
ON s.CUI=c.CUI
WHERE c.LAT='ENG'
AND c.TS='P'
AND STT='PF'
AND ISPREF='Y'
GROUP BY s.CUI"

mysql --defaults-extra-file=${mysql_conf} --database=${mysql_dbname} \
-e "${umls_cuis_query}" > ${umls_cui_file}
${python_command} ${CODE_DIR}/umls_mysql_to_list_jsonl.py ${mysql_conf} ${mysql_dbname} ${output_file}

date
echo "================= finished extract-umls.sh ================="
2 changes: 1 addition & 1 deletion kg2-provided-by-curie-to-infores-curie.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ OBO:ro.owl:
source_name: Relations Ontology
infores_curie: infores:ro
knowledge_type: knowledge_source
OBO:uberon/ext.owl:
OBO:uberon.owl:
source_name: Uber Anatomy Ontology
infores_curie: infores:uberon
knowledge_type: knowledge_source
Expand Down
10 changes: 10 additions & 0 deletions kg2_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
CURIE_PREFIX_CHEMBL_COMPOUND = 'CHEMBL.COMPOUND'
CURIE_PREFIX_CHEMBL_MECHANISM = 'CHEMBL.MECHANISM'
CURIE_PREFIX_CHEMBL_TARGET = 'CHEMBL.TARGET'
CURIE_PREFIX_CHV = 'CHV'
CURIE_PREFIX_CLINICALTRIALS = 'clinicaltrials'
CURIE_PREFIX_DCTERMS = 'dcterms'
CURIE_PREFIX_DGIDB = 'DGIdb'
Expand All @@ -65,12 +66,17 @@
CURIE_PREFIX_DRUGCENTRAL = 'DrugCentral'
CURIE_PREFIX_ENSEMBL = 'ENSEMBL'
CURIE_PREFIX_ENSEMBL_GENOMES = 'EnsemblGenomes'
CURIE_PREFIX_FMA = 'FMA'
CURIE_PREFIX_GO = 'GO'
CURIE_PREFIX_GTPI = 'GTPI'
CURIE_PREFIX_GTPI_SOURCE = 'GTPI_source'
CURIE_PREFIX_HCPCS = 'HCPCS'
CURIE_PREFIX_HGNC = 'HGNC'
CURIE_PREFIX_HMDB = 'HMDB'
CURIE_PREFIX_HP = 'HP'
CURIE_PREFIX_IAO = 'IAO'
CURIE_PREFIX_ICD10PCS = 'ICD10PCS'
CURIE_PREFIX_ICD9 = 'ICD9'
CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY = 'identifiers_org_registry'
CURIE_PREFIX_ISBN = 'ISBN'
CURIE_PREFIX_KEGG = 'KEGG'
Expand All @@ -87,6 +93,7 @@
CURIE_PREFIX_NCBI_GENE = 'NCBIGene'
CURIE_PREFIX_NCBI_TAXON = 'NCBITaxon'
CURIE_PREFIX_NCIT = 'NCIT'
CURIE_PREFIX_NDDF = 'NDDF'
CURIE_PREFIX_OBO = 'OBO'
CURIE_PREFIX_OBO_FORMAT = 'oboFormat'
CURIE_PREFIX_OIO = 'OIO'
Expand All @@ -99,7 +106,9 @@
CURIE_PREFIX_PATHWHIZ_REACTION = 'PathWhiz.Reaction'
CURIE_PREFIX_PATHWHIZ_BOUND = 'PathWhiz.Bound'
CURIE_PREFIX_PATHWHIZ_PROTEIN_COMPLEX = 'PathWhiz.ProteinComplex'
CURIE_PREFIX_PDQ = 'PDQ'
CURIE_PREFIX_PMID = 'PMID'
CURIE_PREFIX_PSY = 'PSY'
CURIE_PREFIX_RDF = 'rdf'
CURIE_PREFIX_RDFS = 'rdfs'
CURIE_PREFIX_REACTOME='REACT'
Expand All @@ -108,6 +117,7 @@
CURIE_PREFIX_RHEA_COMP = 'RHEA.COMP'
CURIE_PREFIX_RO = 'RO'
CURIE_PREFIX_RTX = 'RTX'
CURIE_PREFIX_RXNORM = 'RXNORM'
CURIE_PREFIX_SEMMEDDB = 'SEMMEDDB'
CURIE_PREFIX_SKOS = 'skos'
CURIE_PREFIX_SMPDB = 'SMPDB'
Expand Down
5 changes: 5 additions & 0 deletions misc-tools/mysql_table_to_md_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Rewrites umls_table.txt in place, turning what appears (from this script's
# name) to be a MySQL ASCII-art table into Markdown table syntax.
# All substitutions are line-local and are applied in the order listed.
table_file=umls_table.txt

#   1. every "+" followed by a run of dashes becomes "|--"
#   2. a leading "|" and any spaces after it are removed
#   3. a trailing "--+" at end of line becomes "--"
#   4. runs of spaces are collapsed to a single space
#   5. all "<" and ">" characters are removed
sed -i -E \
    -e "s/\+(-)+/\|--/g" \
    -e "s/^\|( )*//g" \
    -e "s/--\+$/--/g" \
    -e "s/( )+/ /g" \
    -e "s/<|>//g" \
    "${table_file}"
Loading

0 comments on commit d7dd8e3

Please sign in to comment.