Skip to content

Commit

Permalink
study biosamples field
Browse files Browse the repository at this point in the history
sample gold records
  • Loading branch information
turbomam committed Jan 9, 2025
1 parent ed7d151 commit 292cb3b
Show file tree
Hide file tree
Showing 7 changed files with 326 additions and 24 deletions.
32 changes: 32 additions & 0 deletions assets/sample_gold_records/Ga0031151.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{
"apGoldId": "Ga0031151",
"organismGoldId": null,
"referenceApGoldId": null,
"apName": "Syntrophothermus lipocalidus DSM 12680",
"apType": "Genome Analysis (Isolate)",
"studyId": "Gs0000008",
"itsApId": 1377450,
"imgSubmissionId": null,
"imgTaxonOid": 646564577,
"imgPipelineVersion": "IMG/W 3.2",
"assemblyMethod": "Newbler v. 2.1 PreRelease-4/28/2009, Velvet v. 1.1.04, Phrap v. 4.24 SPS",
"publications": [],
"sraRuns": [],
"projects": [
"Gp0004567"
],
"contacts": [
{
"name": "Jonathan Eisen",
"email": "[email protected]",
"roles": [
"submitter",
"PI"
]
}
],
"modDate": "2023-07-05",
"addDate": "2009-01-18"
}
]
70 changes: 70 additions & 0 deletions assets/sample_gold_records/Gb0051341.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
[
{
"biosampleGoldId": "Gb0051341",
"biosampleName": "Hot spring microbial communities from Yellowstone National Park, Wyoming, USA - YNP2 Nymph Lake 10",
"ncbiTaxId": 433727,
"ncbiTaxName": "hot springs metagenome",
"sampleCollectionSite": "Combined lysosyme plus beat-beating ",
"geographicLocation": "USA: Yellowstone National Park, Wyoming",
"latitude": 44.7523206,
"longitude": -110.7253926,
"ecosystemPathId": 3992,
"ecosystem": "Environmental",
"ecosystemCategory": "Aquatic",
"ecosystemType": "Thermal springs",
"ecosystemSubtype": "Hot (42-90C)",
"specificEcosystem": "Unclassified",
"altitudeInMeters": null,
"altitudeInMeters2": null,
"dateCollected": "2007-08-20",
"depthInMeters": null,
"depthInMeters2": null,
"description": "Small acidic pool on hillside north of Nymph Lake.",
"hostDiseases": [],
"elevationInMeters": 2280,
"elevationInMeters2": null,
"geoLocation": "USA: Yellowstone National Park, Wyoming",
"habitat": "Hot spring",
"hostGender": null,
"hostName": null,
"hostNcbiTaxid": null,
"isoCountry": "USA",
"longhurst": null,
"nitrateConcentration": null,
"oxygenConcentration": null,
"ph": "3.75",
"pressure": null,
"salinity": null,
"salinityConcentration": null,
"sampleBodySite": null,
"sampleBodySubsite": null,
"sampleCollectionTemperature": "90 C",
"subsurfaceDepthInMeters": null,
"subsurfaceDepthInMeters2": null,
"visitNum": null,
"mixsPackage": "Standard",
"envoBroadScale": {
"id": "ENVO_00002030",
"label": "aquatic biome"
},
"envoLocalScale": {
"id": "ENVO_00000051",
"label": "hot spring"
},
"envoMedium": {
"id": "ENVO_03600065",
"label": "spring water"
},
"modDate": "2022-08-05",
"addDate": "2008-11-25",
"contacts": [
{
"name": "Zack Jay",
"email": "[email protected]",
"roles": [
"submitter"
]
}
]
}
]
49 changes: 49 additions & 0 deletions assets/sample_gold_records/Go0004567.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
[
{
"organismGoldId": "Go0004567",
"organismName": "Syntrophothermus lipocalidus DSM 12680",
"ncbiTaxId": 643648,
"ncbiTaxonName": "Syntrophothermus lipocalidus DSM 12680",
"ncbiSuperkingdom": "Bacteria",
"ncbiKingdom": "Bacillati",
"ncbiPhylum": "Bacillota",
"ncbiClass": "Clostridia",
"ncbiOrder": "Eubacteriales",
"ncbiFamily": "Syntrophomonadaceae",
"ncbiGenus": "Syntrophothermus",
"ncbiSpecies": "Syntrophothermus lipocalidus",
"bioticRelationships": "Symbiotic",
"oxygenRequirement": "Obligate anaerobe",
"metabolism": "Syntrophic",
"energySources": "",
"gramStain": "Gram+",
"isolationPubmedId": null,
"habitat": "Bioreactor|Sludge",
"sampleCollectionSite": "anaerobic granular sludge, thermophilic UASB reactor",
"ecosystemPathId": 4264,
"ecosystem": "Engineered",
"ecosystemCategory": "Wastewater",
"ecosystemType": "Unclassified",
"ecosystemSubtype": "Unclassified",
"specificEcosystem": "Unclassified",
"isolationHostName": null,
"hostTaxonomyId": null,
"hostBodySite": null,
"hostBodySubsite": null,
"salinity": null,
"salinityConcentration": null,
"cellDiameter": null,
"cellShape": "Rod-shaped",
"color": null,
"motility": "Motile",
"ph": "null",
"pressure": null,
"sporulation": "Nonsporulating",
"carbonSource": null,
"growthTemperature": "55",
"cellLength": null,
"cultured": "Yes",
"modDate": "2023-03-28",
"addDate": "2009-01-18"
}
]
76 changes: 76 additions & 0 deletions assets/sample_gold_records/Gp0004567.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
[
{
"projectGoldId": "Gp0004567",
"projectName": "Syntrophothermus lipocalidus DSM 12680",
"legacyGoldId": "Gc01292",
"studyGoldId": "Gs0000008",
"biosampleGoldId": null,
"organismGoldId": "Go0004567",
"itsProposalId": 867,
"itsSpid": null,
"itsSampleId": null,
"pmoProjectId": 97981,
"gptsProposalId": 97601,
"ncbiBioProjectAccession": "PRJNA37873",
"ncbiBioSampleAccession": "SAMN02598518",
"projectStatus": "Complete and Published",
"sequencingStatus": "Complete",
"sequencingCenters": [
"DOE Joint Genome Institute (JGI)"
],
"jgiFundingProgram": "GEBA",
"jgiFundingYear": 2007,
"hmpId": null,
"seqMethod": [
"454 GS FLX",
"Illumina GAIIx"
],
"contacts": [
{
"name": "Nikos Kyrpides",
"email": "[email protected]",
"roles": [
"submitter"
]
},
{
"name": "Jonathan Eisen",
"email": "[email protected]",
"roles": [
"other",
"PI"
]
}
],
"genomePublications": [
{
"pubmedId": 21304731,
"journalName": "Standards in genomic sciences",
"volume": "3",
"issue": "3",
"page": "268-75",
"title": "Complete genome sequence of Syntrophothermus lipocalidus type strain (TGB-C1).",
"publicationDate": null,
"doi": "10.4056/sigs.1233249"
},
{
"pubmedId": 28604660,
"journalName": "Nature biotechnology",
"volume": "35",
"issue": "7",
"page": "676-683",
"title": "1,003 reference genomes of bacterial and archaeal isolates expand coverage of the tree of life.",
"publicationDate": "2017-02-01",
"doi": "10.1038/nbt.3886"
}
],
"otherPublications": [],
"sraExperimentIds": [
"SRX1950465",
"SRX1950464"
],
"modDate": "2014-04-08",
"addDate": "2009-01-18",
"sequencingStrategy": "Whole Genome Sequencing"
}
]
62 changes: 62 additions & 0 deletions assets/sample_gold_records/Gs0000008.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
[
{
"studyGoldId": "Gs0000008",
"studyName": "A Genomic Encyclopedia of Bacteria and Archaea (GEBA)",
"description": "Genome sequencing of a phylogeny-driven genomic encyclopaedia of Bacteria and Archaea",
"modDate": "2022-08-05",
"addDate": "2007-11-27",
"contacts": [
{
"name": "Nikos Kyrpides",
"email": "[email protected]",
"roles": [
"submitter"
]
},
{
"name": "Genome Biology Program",
"email": null,
"roles": [
"other"
]
},
{
"name": "Jonathan Eisen",
"email": "[email protected]",
"roles": [
"PI",
"sample contact"
]
},
{
"name": "Ilya V. Kublanov",
"email": "[email protected]",
"roles": [
"other"
]
},
{
"name": "Lynne Goodwin",
"email": "[email protected]",
"roles": [
"PM"
]
},
{
"name": "Nicole Shapiro",
"email": "[email protected]",
"roles": [
"PM"
]
},
{
"name": "Bruce D",
"email": "[email protected]",
"roles": [
"other",
"PI"
]
}
]
}
]
35 changes: 13 additions & 22 deletions make-gold-cache.Makefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
# review and fix readmes and poetry dependencies
# use .gitkeep for keeping directories

# todo: just get environmental microbiome studies?
MAX_STUDIES=70000 # 2025-01

# in separate PRs!

MAX_STUDIES=300
.PHONY: load-gold-biosamples-into-mongo

# Download the GOLD site-wide Excel export.
# wget -O creates/truncates its output file before the transfer completes, so a
# failed or interrupted download would leave a file that Make treats as up to
# date. Download to a temporary name and rename only after wget succeeds.
downloads/goldData.xlsx:
	wget -O $@.tmp "https://gold.jgi.doe.gov/download?mode=site_excel"
	mv -f $@.tmp $@
Expand All @@ -17,19 +14,16 @@ local/gold-studies.tsv: downloads/goldData.xlsx
--output-file $@

# Extract the study ids from the first column of the studies table.
# tail skips the first line (presumably the header row); without the grep
# filter, some non-id rows leak through.
# The list is built in a temporary file and renamed on success: if the pipeline
# fails (grep exits non-zero when nothing matches), no empty target is left
# behind looking newer than its prerequisite.
local/gold-study-ids.txt: local/gold-studies.tsv
	tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@.tmp
	mv -f $@.tmp $@

# Keep only the first MAX_STUDIES study ids.
# NOTE: MAX_STUDIES is not a prerequisite, so changing it does not by itself
# make this target out of date; remove the file to regenerate with a new limit.
# The subset is written to a temporary file first so that a failing head (e.g.
# a non-numeric MAX_STUDIES) cannot leave an empty target behind.
local/gold-study-ids-subset.txt: local/gold-study-ids.txt
	head -n $(MAX_STUDIES) $< > $@.tmp
	mv -f $@.tmp $@

local/gold-cache.json: local/gold-study-ids-subset.txt
# ~ 3 seconds/uncached study
# ~ 30 studies/minute
# ~ 2k studies/hour
# ~ 50k studies/day
# GOLD has ~ 63k studies
# < 2 days to cache all studies ?
# < 2 days to fetch all studies ?
poetry run python sample_annotator/clients/gold_client.py \
--verbose \
fetch-studies \
Expand All @@ -39,6 +33,14 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
--authentication-file config/gold-key.txt \
$<

# Run sample_annotator/gold_to_mongo.py for the study ids in the subset file,
# using the Mongo database name gold_metadata.
# The script is invoked without --purge-mongodb and --purge-diskcache; append
# them to the command below to use them (presumably they clear previously
# loaded/cached data first -- confirm in gold_to_mongo.py).
load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
	poetry run python sample_annotator/gold_to_mongo.py \
		--authentication-file config/gold-key.txt \
		--mongo-db-name gold_metadata \
		--study-ids-file $<

#.PHONY: split-out-gold-biosamples
#split-out-gold-biosamples: local/gold-cache.json
# poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
Expand All @@ -47,15 +49,4 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
# --biosample-output-file local/gold-biosamples-only.json \
# --project-output-file local/gold-projects-only.json \
# --remove-contacts \
# --remove-nulls

.PHONY: load-gold-biosamples-into-mongo
load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
# --purge-mongodb
# --purge-diskcache
poetry run python sample_annotator/gold_to_mongo.py \
--authentication-file config/gold-key.txt \
--mongo-db-name gold_metadata \
--study-ids-file $< \
--purge-mongodb \
--purge-diskcache
# --remove-nulls
Loading

0 comments on commit 292cb3b

Please sign in to comment.