From 292cb3be8d94d97faacf6467c3da3a67506e14aa Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Thu, 9 Jan 2025 10:15:05 -0500 Subject: [PATCH] study biosamples field sample gold records --- assets/sample_gold_records/Ga0031151.json | 32 ++++++++++ assets/sample_gold_records/Gb0051341.json | 70 +++++++++++++++++++++ assets/sample_gold_records/Go0004567.json | 49 +++++++++++++++ assets/sample_gold_records/Gp0004567.json | 76 +++++++++++++++++++++++ assets/sample_gold_records/Gs0000008.json | 62 ++++++++++++++++++ make-gold-cache.Makefile | 35 ++++------- sample_annotator/gold_to_mongo.py | 26 +++++++- 7 files changed, 326 insertions(+), 24 deletions(-) create mode 100644 assets/sample_gold_records/Ga0031151.json create mode 100644 assets/sample_gold_records/Gb0051341.json create mode 100644 assets/sample_gold_records/Go0004567.json create mode 100644 assets/sample_gold_records/Gp0004567.json create mode 100644 assets/sample_gold_records/Gs0000008.json diff --git a/assets/sample_gold_records/Ga0031151.json b/assets/sample_gold_records/Ga0031151.json new file mode 100644 index 0000000..dd7dda3 --- /dev/null +++ b/assets/sample_gold_records/Ga0031151.json @@ -0,0 +1,32 @@ +[ + { + "apGoldId": "Ga0031151", + "organismGoldId": null, + "referenceApGoldId": null, + "apName": "Syntrophothermus lipocalidus DSM 12680", + "apType": "Genome Analysis (Isolate)", + "studyId": "Gs0000008", + "itsApId": 1377450, + "imgSubmissionId": null, + "imgTaxonOid": 646564577, + "imgPipelineVersion": "IMG/W 3.2", + "assemblyMethod": "Newbler v. 2.1 PreRelease-4/28/2009, Velvet v. 1.1.04, Phrap v. 
4.24 SPS", + "publications": [], + "sraRuns": [], + "projects": [ + "Gp0004567" + ], + "contacts": [ + { + "name": "Jonathan Eisen", + "email": "jonathan.eisen@gmail.com", + "roles": [ + "submitter", + "PI" + ] + } + ], + "modDate": "2023-07-05", + "addDate": "2009-01-18" + } +] \ No newline at end of file diff --git a/assets/sample_gold_records/Gb0051341.json b/assets/sample_gold_records/Gb0051341.json new file mode 100644 index 0000000..1a989bc --- /dev/null +++ b/assets/sample_gold_records/Gb0051341.json @@ -0,0 +1,70 @@ +[ + { + "biosampleGoldId": "Gb0051341", + "biosampleName": "Hot spring microbial communities from Yellowstone National Park, Wyoming, USA - YNP2 Nymph Lake 10", + "ncbiTaxId": 433727, + "ncbiTaxName": "hot springs metagenome", + "sampleCollectionSite": "Combined lysosyme plus beat-beating ", + "geographicLocation": "USA: Yellowstone National Park, Wyoming", + "latitude": 44.7523206, + "longitude": -110.7253926, + "ecosystemPathId": 3992, + "ecosystem": "Environmental", + "ecosystemCategory": "Aquatic", + "ecosystemType": "Thermal springs", + "ecosystemSubtype": "Hot (42-90C)", + "specificEcosystem": "Unclassified", + "altitudeInMeters": null, + "altitudeInMeters2": null, + "dateCollected": "2007-08-20", + "depthInMeters": null, + "depthInMeters2": null, + "description": "Small acidic pool on hillside north of Nymph Lake.", + "hostDiseases": [], + "elevationInMeters": 2280, + "elevationInMeters2": null, + "geoLocation": "USA: Yellowstone National Park, Wyoming", + "habitat": "Hot spring", + "hostGender": null, + "hostName": null, + "hostNcbiTaxid": null, + "isoCountry": "USA", + "longhurst": null, + "nitrateConcentration": null, + "oxygenConcentration": null, + "ph": "3.75", + "pressure": null, + "salinity": null, + "salinityConcentration": null, + "sampleBodySite": null, + "sampleBodySubsite": null, + "sampleCollectionTemperature": "90 C", + "subsurfaceDepthInMeters": null, + "subsurfaceDepthInMeters2": null, + "visitNum": null, + 
"mixsPackage": "Standard", + "envoBroadScale": { + "id": "ENVO_00002030", + "label": "aquatic biome" + }, + "envoLocalScale": { + "id": "ENVO_00000051", + "label": "hot spring" + }, + "envoMedium": { + "id": "ENVO_03600065", + "label": "spring water" + }, + "modDate": "2022-08-05", + "addDate": "2008-11-25", + "contacts": [ + { + "name": "Zack Jay", + "email": "zackary.jay@montana.edu", + "roles": [ + "submitter" + ] + } + ] + } +] \ No newline at end of file diff --git a/assets/sample_gold_records/Go0004567.json b/assets/sample_gold_records/Go0004567.json new file mode 100644 index 0000000..b2e9e1c --- /dev/null +++ b/assets/sample_gold_records/Go0004567.json @@ -0,0 +1,49 @@ +[ + { + "organismGoldId": "Go0004567", + "organismName": "Syntrophothermus lipocalidus DSM 12680", + "ncbiTaxId": 643648, + "ncbiTaxonName": "Syntrophothermus lipocalidus DSM 12680", + "ncbiSuperkingdom": "Bacteria", + "ncbiKingdom": "Bacillati", + "ncbiPhylum": "Bacillota", + "ncbiClass": "Clostridia", + "ncbiOrder": "Eubacteriales", + "ncbiFamily": "Syntrophomonadaceae", + "ncbiGenus": "Syntrophothermus", + "ncbiSpecies": "Syntrophothermus lipocalidus", + "bioticRelationships": "Symbiotic", + "oxygenRequirement": "Obligate anaerobe", + "metabolism": "Syntrophic", + "energySources": "", + "gramStain": "Gram+", + "isolationPubmedId": null, + "habitat": "Bioreactor|Sludge", + "sampleCollectionSite": "anaerobic granular sludge, thermophilic UASB reactor", + "ecosystemPathId": 4264, + "ecosystem": "Engineered", + "ecosystemCategory": "Wastewater", + "ecosystemType": "Unclassified", + "ecosystemSubtype": "Unclassified", + "specificEcosystem": "Unclassified", + "isolationHostName": null, + "hostTaxonomyId": null, + "hostBodySite": null, + "hostBodySubsite": null, + "salinity": null, + "salinityConcentration": null, + "cellDiameter": null, + "cellShape": "Rod-shaped", + "color": null, + "motility": "Motile", + "ph": "null", + "pressure": null, + "sporulation": "Nonsporulating", + "carbonSource": 
null, + "growthTemperature": "55", + "cellLength": null, + "cultured": "Yes", + "modDate": "2023-03-28", + "addDate": "2009-01-18" + } +] \ No newline at end of file diff --git a/assets/sample_gold_records/Gp0004567.json b/assets/sample_gold_records/Gp0004567.json new file mode 100644 index 0000000..4c29f50 --- /dev/null +++ b/assets/sample_gold_records/Gp0004567.json @@ -0,0 +1,76 @@ +[ + { + "projectGoldId": "Gp0004567", + "projectName": "Syntrophothermus lipocalidus DSM 12680", + "legacyGoldId": "Gc01292", + "studyGoldId": "Gs0000008", + "biosampleGoldId": null, + "organismGoldId": "Go0004567", + "itsProposalId": 867, + "itsSpid": null, + "itsSampleId": null, + "pmoProjectId": 97981, + "gptsProposalId": 97601, + "ncbiBioProjectAccession": "PRJNA37873", + "ncbiBioSampleAccession": "SAMN02598518", + "projectStatus": "Complete and Published", + "sequencingStatus": "Complete", + "sequencingCenters": [ + "DOE Joint Genome Institute (JGI)" + ], + "jgiFundingProgram": "GEBA", + "jgiFundingYear": 2007, + "hmpId": null, + "seqMethod": [ + "454 GS FLX", + "Illumina GAIIx" + ], + "contacts": [ + { + "name": "Nikos Kyrpides", + "email": "nckyrpides@lbl.gov", + "roles": [ + "submitter" + ] + }, + { + "name": "Jonathan Eisen", + "email": "jonathan.eisen@gmail.com", + "roles": [ + "other", + "PI" + ] + } + ], + "genomePublications": [ + { + "pubmedId": 21304731, + "journalName": "Standards in genomic sciences", + "volume": "3", + "issue": "3", + "page": "268-75", + "title": "Complete genome sequence of Syntrophothermus lipocalidus type strain (TGB-C1).", + "publicationDate": null, + "doi": "10.4056/sigs.1233249" + }, + { + "pubmedId": 28604660, + "journalName": "Nature biotechnology", + "volume": "35", + "issue": "7", + "page": "676-683", + "title": "1,003 reference genomes of bacterial and archaeal isolates expand coverage of the tree of life.", + "publicationDate": "2017-02-01", + "doi": "10.1038/nbt.3886" + } + ], + "otherPublications": [], + "sraExperimentIds": [ + 
"SRX1950465", + "SRX1950464" + ], + "modDate": "2014-04-08", + "addDate": "2009-01-18", + "sequencingStrategy": "Whole Genome Sequencing" + } +] \ No newline at end of file diff --git a/assets/sample_gold_records/Gs0000008.json b/assets/sample_gold_records/Gs0000008.json new file mode 100644 index 0000000..eff64f8 --- /dev/null +++ b/assets/sample_gold_records/Gs0000008.json @@ -0,0 +1,62 @@ +[ + { + "studyGoldId": "Gs0000008", + "studyName": "A Genomic Encyclopedia of Bacteria and Archaea (GEBA)", + "description": "Genome sequencing of a phylogeny-driven genomic encyclopaedia of Bacteria and Archaea", + "modDate": "2022-08-05", + "addDate": "2007-11-27", + "contacts": [ + { + "name": "Nikos Kyrpides", + "email": "nckyrpides@lbl.gov", + "roles": [ + "submitter" + ] + }, + { + "name": "Genome Biology Program", + "email": null, + "roles": [ + "other" + ] + }, + { + "name": "Jonathan Eisen", + "email": "jonathan.eisen@gmail.com", + "roles": [ + "PI", + "sample contact" + ] + }, + { + "name": "Ilya V. Kublanov", + "email": "kublanov.ilya@gmail.com", + "roles": [ + "other" + ] + }, + { + "name": "Lynne Goodwin", + "email": "lynneg@lanl.gov", + "roles": [ + "PM" + ] + }, + { + "name": "Nicole Shapiro", + "email": "nrshapiro@lbl.gov", + "roles": [ + "PM" + ] + }, + { + "name": "Bruce D", + "email": "microbe@cuba.jgi-psf.org", + "roles": [ + "other", + "PI" + ] + } + ] + } +] \ No newline at end of file diff --git a/make-gold-cache.Makefile b/make-gold-cache.Makefile index 6570189..f72d051 100644 --- a/make-gold-cache.Makefile +++ b/make-gold-cache.Makefile @@ -1,11 +1,8 @@ # review and fix readmes and poetry dependencies -# use .gitkeep for keeping directories -# todo: just get environmental microbiome studies? +MAX_STUDIES=70000 # 2025-01 -# in separate PRs! 
- -MAX_STUDIES=300 +.PHONY: load-gold-biosamples-into-mongo downloads/goldData.xlsx: wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel" @@ -17,7 +14,7 @@ local/gold-studies.tsv: downloads/goldData.xlsx --output-file $@ local/gold-study-ids.txt: local/gold-studies.tsv - # this introduces some noise (non-id rows) + # without the grep filter, this introduces some noise (non-id rows) tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@ local/gold-study-ids-subset.txt: local/gold-study-ids.txt @@ -25,11 +22,8 @@ local/gold-study-ids-subset.txt: local/gold-study-ids.txt local/gold-cache.json: local/gold-study-ids-subset.txt # ~ 3 seconds/uncached study - # ~ 30 studies/minute - # ~ 2k studies/hour - # ~ 50k studies/day # GOLD has ~ 63k studies - # < 2 days to cache all studies ? + # < 2 days to fetch all studies ? poetry run python sample_annotator/clients/gold_client.py \ --verbose \ fetch-studies \ @@ -39,6 +33,14 @@ local/gold-cache.json: local/gold-study-ids-subset.txt --authentication-file config/gold-key.txt \ $< +load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt + # --purge-mongodb + # --purge-diskcache + poetry run python sample_annotator/gold_to_mongo.py \ + --authentication-file config/gold-key.txt \ + --mongo-db-name gold_metadata \ + --study-ids-file $< + #.PHONY: split-out-gold-biosamples #split-out-gold-biosamples: local/gold-cache.json # poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \ @@ -47,15 +49,4 @@ local/gold-cache.json: local/gold-study-ids-subset.txt # --biosample-output-file local/gold-biosamples-only.json \ # --project-output-file local/gold-projects-only.json \ # --remove-contacts \ -# --remove-nulls - -.PHONY: load-gold-biosamples-into-mongo -load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt - # --purge-mongodb - # --purge-diskcache - poetry run python sample_annotator/gold_to_mongo.py \ - --authentication-file config/gold-key.txt \ - --mongo-db-name gold_metadata \ - 
--study-ids-file $< \ - --purge-mongodb \ - --purge-diskcache +# --remove-nulls \ No newline at end of file diff --git a/sample_annotator/gold_to_mongo.py b/sample_annotator/gold_to_mongo.py index 2c321cb..0a129b7 100644 --- a/sample_annotator/gold_to_mongo.py +++ b/sample_annotator/gold_to_mongo.py @@ -10,6 +10,10 @@ # todo might need better API error handling # should be more consistent about bundling (projects in biosamples) vs getting biosamples separate from studies +# todo document the fact that a biosamples key is added to studies +# biosamples have no foreign keys +# (sequencing) projects include native study and biosample foreign keys + # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -116,16 +120,34 @@ def main(mongo_db_name: str, study_ids_file: str, authentication_file: str, for study_id in study_ids: logging.info(f"Processing study {study_id}...") + + # Fetch the study record study = gc.fetch_study(study_id, **args) - insert_document(study_collection, study, study_id) + # Fetch biosamples associated with the study biosamples = gc.fetch_biosamples_by_study(study_id, **args) logging.info(f"Retrieved {len(biosamples)} biosamples for study {study_id}") + # Collect biosampleGoldIds for the study + biosample_ids = [] + for biosample in biosamples: + biosample_id = biosample.get('biosampleGoldId', None) + if biosample_id: + biosample_ids.append(biosample_id) + + # Handle associated projects for project in biosample.pop('projects', []): insert_document(project_collection, project, project.get('projectGoldId', 'Unknown')) - insert_document(biosample_collection, biosample, biosample.get('biosampleGoldId', 'Unknown')) + + # Insert biosample into MongoDB + insert_document(biosample_collection, biosample, biosample_id) + + # Add the biosamples list to the study record + study['biosamples'] = biosample_ids + + # Insert the study record into MongoDB + insert_document(study_collection, study, study_id) 
# Close the connection client.close()