From 292cb3be8d94d97faacf6467c3da3a67506e14aa Mon Sep 17 00:00:00 2001
From: "Mark A. Miller" <MAM@lbl.gov>
Date: Thu, 9 Jan 2025 10:15:05 -0500
Subject: [PATCH] study biosamples field

sample gold records
---
 assets/sample_gold_records/Ga0031151.json | 32 ++++++++++
 assets/sample_gold_records/Gb0051341.json | 70 +++++++++++++++++++++
 assets/sample_gold_records/Go0004567.json | 49 +++++++++++++++
 assets/sample_gold_records/Gp0004567.json | 76 +++++++++++++++++++++++
 assets/sample_gold_records/Gs0000008.json | 62 ++++++++++++++++++
 make-gold-cache.Makefile                  | 35 ++++-------
 sample_annotator/gold_to_mongo.py         | 26 +++++++-
 7 files changed, 326 insertions(+), 24 deletions(-)
 create mode 100644 assets/sample_gold_records/Ga0031151.json
 create mode 100644 assets/sample_gold_records/Gb0051341.json
 create mode 100644 assets/sample_gold_records/Go0004567.json
 create mode 100644 assets/sample_gold_records/Gp0004567.json
 create mode 100644 assets/sample_gold_records/Gs0000008.json

diff --git a/assets/sample_gold_records/Ga0031151.json b/assets/sample_gold_records/Ga0031151.json
new file mode 100644
index 0000000..dd7dda3
--- /dev/null
+++ b/assets/sample_gold_records/Ga0031151.json
@@ -0,0 +1,32 @@
+[
+  {
+    "apGoldId": "Ga0031151",
+    "organismGoldId": null,
+    "referenceApGoldId": null,
+    "apName": "Syntrophothermus lipocalidus DSM 12680",
+    "apType": "Genome Analysis (Isolate)",
+    "studyId": "Gs0000008",
+    "itsApId": 1377450,
+    "imgSubmissionId": null,
+    "imgTaxonOid": 646564577,
+    "imgPipelineVersion": "IMG/W 3.2",
+    "assemblyMethod": "Newbler v. 2.1 PreRelease-4/28/2009, Velvet v. 1.1.04, Phrap v. 4.24 SPS",
+    "publications": [],
+    "sraRuns": [],
+    "projects": [
+      "Gp0004567"
+    ],
+    "contacts": [
+      {
+        "name": "Jonathan Eisen",
+        "email": "jonathan.eisen@gmail.com",
+        "roles": [
+          "submitter",
+          "PI"
+        ]
+      }
+    ],
+    "modDate": "2023-07-05",
+    "addDate": "2009-01-18"
+  }
+]
\ No newline at end of file
diff --git a/assets/sample_gold_records/Gb0051341.json b/assets/sample_gold_records/Gb0051341.json
new file mode 100644
index 0000000..1a989bc
--- /dev/null
+++ b/assets/sample_gold_records/Gb0051341.json
@@ -0,0 +1,70 @@
+[
+  {
+    "biosampleGoldId": "Gb0051341",
+    "biosampleName": "Hot spring microbial communities from Yellowstone National Park, Wyoming, USA - YNP2   Nymph Lake 10",
+    "ncbiTaxId": 433727,
+    "ncbiTaxName": "hot springs metagenome",
+    "sampleCollectionSite": "Combined lysosyme plus beat-beating ",
+    "geographicLocation": "USA: Yellowstone National Park, Wyoming",
+    "latitude": 44.7523206,
+    "longitude": -110.7253926,
+    "ecosystemPathId": 3992,
+    "ecosystem": "Environmental",
+    "ecosystemCategory": "Aquatic",
+    "ecosystemType": "Thermal springs",
+    "ecosystemSubtype": "Hot (42-90C)",
+    "specificEcosystem": "Unclassified",
+    "altitudeInMeters": null,
+    "altitudeInMeters2": null,
+    "dateCollected": "2007-08-20",
+    "depthInMeters": null,
+    "depthInMeters2": null,
+    "description": "Small acidic pool on hillside north of Nymph Lake.",
+    "hostDiseases": [],
+    "elevationInMeters": 2280,
+    "elevationInMeters2": null,
+    "geoLocation": "USA: Yellowstone National Park, Wyoming",
+    "habitat": "Hot spring",
+    "hostGender": null,
+    "hostName": null,
+    "hostNcbiTaxid": null,
+    "isoCountry": "USA",
+    "longhurst": null,
+    "nitrateConcentration": null,
+    "oxygenConcentration": null,
+    "ph": "3.75",
+    "pressure": null,
+    "salinity": null,
+    "salinityConcentration": null,
+    "sampleBodySite": null,
+    "sampleBodySubsite": null,
+    "sampleCollectionTemperature": "90 C",
+    "subsurfaceDepthInMeters": null,
+    "subsurfaceDepthInMeters2": null,
+    "visitNum": null,
+    "mixsPackage": "Standard",
+    "envoBroadScale": {
+      "id": "ENVO_00002030",
+      "label": "aquatic biome"
+    },
+    "envoLocalScale": {
+      "id": "ENVO_00000051",
+      "label": "hot spring"
+    },
+    "envoMedium": {
+      "id": "ENVO_03600065",
+      "label": "spring water"
+    },
+    "modDate": "2022-08-05",
+    "addDate": "2008-11-25",
+    "contacts": [
+      {
+        "name": "Zack Jay",
+        "email": "zackary.jay@montana.edu",
+        "roles": [
+          "submitter"
+        ]
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/assets/sample_gold_records/Go0004567.json b/assets/sample_gold_records/Go0004567.json
new file mode 100644
index 0000000..b2e9e1c
--- /dev/null
+++ b/assets/sample_gold_records/Go0004567.json
@@ -0,0 +1,49 @@
+[
+  {
+    "organismGoldId": "Go0004567",
+    "organismName": "Syntrophothermus lipocalidus DSM 12680",
+    "ncbiTaxId": 643648,
+    "ncbiTaxonName": "Syntrophothermus lipocalidus DSM 12680",
+    "ncbiSuperkingdom": "Bacteria",
+    "ncbiKingdom": "Bacillati",
+    "ncbiPhylum": "Bacillota",
+    "ncbiClass": "Clostridia",
+    "ncbiOrder": "Eubacteriales",
+    "ncbiFamily": "Syntrophomonadaceae",
+    "ncbiGenus": "Syntrophothermus",
+    "ncbiSpecies": "Syntrophothermus lipocalidus",
+    "bioticRelationships": "Symbiotic",
+    "oxygenRequirement": "Obligate anaerobe",
+    "metabolism": "Syntrophic",
+    "energySources": "",
+    "gramStain": "Gram+",
+    "isolationPubmedId": null,
+    "habitat": "Bioreactor|Sludge",
+    "sampleCollectionSite": "anaerobic granular sludge, thermophilic UASB reactor",
+    "ecosystemPathId": 4264,
+    "ecosystem": "Engineered",
+    "ecosystemCategory": "Wastewater",
+    "ecosystemType": "Unclassified",
+    "ecosystemSubtype": "Unclassified",
+    "specificEcosystem": "Unclassified",
+    "isolationHostName": null,
+    "hostTaxonomyId": null,
+    "hostBodySite": null,
+    "hostBodySubsite": null,
+    "salinity": null,
+    "salinityConcentration": null,
+    "cellDiameter": null,
+    "cellShape": "Rod-shaped",
+    "color": null,
+    "motility": "Motile",
+    "ph": "null",
+    "pressure": null,
+    "sporulation": "Nonsporulating",
+    "carbonSource": null,
+    "growthTemperature": "55",
+    "cellLength": null,
+    "cultured": "Yes",
+    "modDate": "2023-03-28",
+    "addDate": "2009-01-18"
+  }
+]
\ No newline at end of file
diff --git a/assets/sample_gold_records/Gp0004567.json b/assets/sample_gold_records/Gp0004567.json
new file mode 100644
index 0000000..4c29f50
--- /dev/null
+++ b/assets/sample_gold_records/Gp0004567.json
@@ -0,0 +1,76 @@
+[
+  {
+    "projectGoldId": "Gp0004567",
+    "projectName": "Syntrophothermus lipocalidus DSM 12680",
+    "legacyGoldId": "Gc01292",
+    "studyGoldId": "Gs0000008",
+    "biosampleGoldId": null,
+    "organismGoldId": "Go0004567",
+    "itsProposalId": 867,
+    "itsSpid": null,
+    "itsSampleId": null,
+    "pmoProjectId": 97981,
+    "gptsProposalId": 97601,
+    "ncbiBioProjectAccession": "PRJNA37873",
+    "ncbiBioSampleAccession": "SAMN02598518",
+    "projectStatus": "Complete and Published",
+    "sequencingStatus": "Complete",
+    "sequencingCenters": [
+      "DOE Joint Genome Institute (JGI)"
+    ],
+    "jgiFundingProgram": "GEBA",
+    "jgiFundingYear": 2007,
+    "hmpId": null,
+    "seqMethod": [
+      "454 GS FLX",
+      "Illumina GAIIx"
+    ],
+    "contacts": [
+      {
+        "name": "Nikos Kyrpides",
+        "email": "nckyrpides@lbl.gov",
+        "roles": [
+          "submitter"
+        ]
+      },
+      {
+        "name": "Jonathan Eisen",
+        "email": "jonathan.eisen@gmail.com",
+        "roles": [
+          "other",
+          "PI"
+        ]
+      }
+    ],
+    "genomePublications": [
+      {
+        "pubmedId": 21304731,
+        "journalName": "Standards in genomic sciences",
+        "volume": "3",
+        "issue": "3",
+        "page": "268-75",
+        "title": "Complete genome sequence of Syntrophothermus lipocalidus type strain (TGB-C1).",
+        "publicationDate": null,
+        "doi": "10.4056/sigs.1233249"
+      },
+      {
+        "pubmedId": 28604660,
+        "journalName": "Nature biotechnology",
+        "volume": "35",
+        "issue": "7",
+        "page": "676-683",
+        "title": "1,003 reference genomes of bacterial and archaeal isolates expand coverage of the tree of life.",
+        "publicationDate": "2017-02-01",
+        "doi": "10.1038/nbt.3886"
+      }
+    ],
+    "otherPublications": [],
+    "sraExperimentIds": [
+      "SRX1950465",
+      "SRX1950464"
+    ],
+    "modDate": "2014-04-08",
+    "addDate": "2009-01-18",
+    "sequencingStrategy": "Whole Genome Sequencing"
+  }
+]
\ No newline at end of file
diff --git a/assets/sample_gold_records/Gs0000008.json b/assets/sample_gold_records/Gs0000008.json
new file mode 100644
index 0000000..eff64f8
--- /dev/null
+++ b/assets/sample_gold_records/Gs0000008.json
@@ -0,0 +1,62 @@
+[
+  {
+    "studyGoldId": "Gs0000008",
+    "studyName": "A Genomic Encyclopedia of Bacteria and Archaea (GEBA)",
+    "description": "Genome sequencing of a phylogeny-driven genomic encyclopaedia of Bacteria and Archaea",
+    "modDate": "2022-08-05",
+    "addDate": "2007-11-27",
+    "contacts": [
+      {
+        "name": "Nikos Kyrpides",
+        "email": "nckyrpides@lbl.gov",
+        "roles": [
+          "submitter"
+        ]
+      },
+      {
+        "name": "Genome Biology Program",
+        "email": null,
+        "roles": [
+          "other"
+        ]
+      },
+      {
+        "name": "Jonathan Eisen",
+        "email": "jonathan.eisen@gmail.com",
+        "roles": [
+          "PI",
+          "sample contact"
+        ]
+      },
+      {
+        "name": "Ilya V. Kublanov",
+        "email": "kublanov.ilya@gmail.com",
+        "roles": [
+          "other"
+        ]
+      },
+      {
+        "name": "Lynne Goodwin",
+        "email": "lynneg@lanl.gov",
+        "roles": [
+          "PM"
+        ]
+      },
+      {
+        "name": "Nicole Shapiro",
+        "email": "nrshapiro@lbl.gov",
+        "roles": [
+          "PM"
+        ]
+      },
+      {
+        "name": "Bruce D",
+        "email": "microbe@cuba.jgi-psf.org",
+        "roles": [
+          "other",
+          "PI"
+        ]
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/make-gold-cache.Makefile b/make-gold-cache.Makefile
index 6570189..f72d051 100644
--- a/make-gold-cache.Makefile
+++ b/make-gold-cache.Makefile
@@ -1,11 +1,8 @@
 # review and fix readmes and poetry dependencies
-# use .gitkeep for keeping directories
 
-# todo: just get environmental microbiome studies?
+MAX_STUDIES=70000 # 2025-01
 
-# in separate PRs!
-
-MAX_STUDIES=300
+.PHONY: load-gold-biosamples-into-mongo
 
 downloads/goldData.xlsx:
 	wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel"
@@ -17,7 +14,7 @@ local/gold-studies.tsv: downloads/goldData.xlsx
 		--output-file $@
 
 local/gold-study-ids.txt: local/gold-studies.tsv
-	# this introduces some noise (non-id rows)
+	# without the grep filter, this introduces some noise (non-id rows)
 	tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@
 
 local/gold-study-ids-subset.txt: local/gold-study-ids.txt
@@ -25,11 +22,8 @@ local/gold-study-ids-subset.txt: local/gold-study-ids.txt
 
 local/gold-cache.json: local/gold-study-ids-subset.txt
 	# ~ 3 seconds/uncached study
-	# ~ 30 studies/minute
-	# ~ 2k studies/hour
-	# ~ 50k studies/day
 	# GOLD has ~ 63k studies
-	# < 2 days to cache all studies ?
+	# < 2 days to fetch all studies ?
 	poetry run python sample_annotator/clients/gold_client.py \
 		--verbose \
 		fetch-studies \
@@ -39,6 +33,14 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
 		--authentication-file config/gold-key.txt \
 		$<
 
+load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
+	# 		--purge-mongodb
+	# 		--purge-diskcache
+	poetry run python sample_annotator/gold_to_mongo.py \
+		--authentication-file config/gold-key.txt \
+		--mongo-db-name gold_metadata \
+		--study-ids-file $<
+
 #.PHONY: split-out-gold-biosamples
 #split-out-gold-biosamples: local/gold-cache.json
 #	poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
@@ -47,15 +49,4 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
 #		--biosample-output-file local/gold-biosamples-only.json \
 #		--project-output-file local/gold-projects-only.json \
 #		--remove-contacts \
-#		--remove-nulls
-
-.PHONY: load-gold-biosamples-into-mongo
-load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
-	# 		--purge-mongodb
-	# 		--purge-diskcache
-	poetry run python sample_annotator/gold_to_mongo.py \
-		--authentication-file config/gold-key.txt \
-		--mongo-db-name gold_metadata \
-		--study-ids-file $< \
-		--purge-mongodb \
-		--purge-diskcache
+#		--remove-nulls
\ No newline at end of file
diff --git a/sample_annotator/gold_to_mongo.py b/sample_annotator/gold_to_mongo.py
index 2c321cb..0a129b7 100644
--- a/sample_annotator/gold_to_mongo.py
+++ b/sample_annotator/gold_to_mongo.py
@@ -10,6 +10,10 @@
 # todo might need better API error handling
 #   should be more consistent about bundling (projects in biosamples) vs getting biosamples separate from studies
 
+# todo document the fact that a biosamples key is added to studies
+#   biosamples have no foreign keys
+#   (sequencing) projects include native study and biosample foreign keys
+
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
@@ -116,16 +120,34 @@ def main(mongo_db_name: str, study_ids_file: str, authentication_file: str,
 
     for study_id in study_ids:
         logging.info(f"Processing study {study_id}...")
+
+        # Fetch the study record
         study = gc.fetch_study(study_id, **args)
-        insert_document(study_collection, study, study_id)
 
+        # Fetch biosamples associated with the study
         biosamples = gc.fetch_biosamples_by_study(study_id, **args)
         logging.info(f"Retrieved {len(biosamples)} biosamples for study {study_id}")
 
+        # Collect biosampleGoldIds for the study
+        biosample_ids = []
+
         for biosample in biosamples:
+            biosample_id = biosample.get('biosampleGoldId', None)
+            if biosample_id:
+                biosample_ids.append(biosample_id)
+
+            # Handle associated projects
             for project in biosample.pop('projects', []):
                 insert_document(project_collection, project, project.get('projectGoldId', 'Unknown'))
-            insert_document(biosample_collection, biosample, biosample.get('biosampleGoldId', 'Unknown'))
+
+            # Insert biosample into MongoDB
+            insert_document(biosample_collection, biosample, biosample_id)
+
+        # Add the biosamples list to the study record
+        study['biosamples'] = biosample_ids
+
+        # Insert the study record into MongoDB
+        insert_document(study_collection, study, study_id)
 
     # Close the connection
     client.close()