Commit
write to MongoDB
turbomam committed Jan 8, 2025
1 parent 42a9213 commit ed7d151
Showing 2 changed files with 165 additions and 12 deletions.
42 changes: 30 additions & 12 deletions make-gold-cache.Makefile
@@ -5,7 +5,7 @@

# in separate PRs!

-MAX_STUDY_ID=100
+MAX_STUDIES=300

downloads/goldData.xlsx:
	wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel"
@@ -17,12 +17,19 @@ local/gold-studies.tsv: downloads/goldData.xlsx
		--output-file $@

local/gold-study-ids.txt: local/gold-studies.tsv
-	tail -n +2 $< | cut -f 1 > $@
+	# column 1 includes some noise (non-ID rows); GOLD study IDs start with 'Gs'
+	tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@

local/gold-study-ids-subset.txt: local/gold-study-ids.txt
-	head -n $(MAX_STUDY_ID) $< > $@
+	head -n $(MAX_STUDIES) $< > $@

local/gold-cache.json: local/gold-study-ids-subset.txt
+# ~ 2 seconds/uncached study
+# ~ 30 studies/minute
+# ~ 2k studies/hour
+# ~ 50k studies/day
+# GOLD has ~ 63k studies
+# so < 2 days to cache all studies
	poetry run python sample_annotator/clients/gold_client.py \
		--verbose \
		fetch-studies \
@@ -32,12 +39,23 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
		--authentication-file config/gold-key.txt \
		$<

-.PHONY: split-out-gold-biosamples
-split-out-gold-biosamples: local/gold-cache.json
-	poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
-		--input-file $< \
-		--study-output-file local/gold-studies-only.json \
-		--biosample-output-file local/gold-biosamples-only.json \
-		--project-output-file local/gold-projects-only.json \
-		--remove-contacts \
-		--remove-nulls
+#.PHONY: split-out-gold-biosamples
+#split-out-gold-biosamples: local/gold-cache.json
+#	poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
+#		--input-file $< \
+#		--study-output-file local/gold-studies-only.json \
+#		--biosample-output-file local/gold-biosamples-only.json \
+#		--project-output-file local/gold-projects-only.json \
+#		--remove-contacts \
+#		--remove-nulls

+.PHONY: load-gold-biosamples-into-mongo
+load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
+# note: both purge flags below are optional; drop them to keep existing data and cache
+	poetry run python sample_annotator/gold_to_mongo.py \
+		--authentication-file config/gold-key.txt \
+		--mongo-db-name gold_metadata \
+		--study-ids-file $< \
+		--purge-mongodb \
+		--purge-diskcache
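
After load-gold-biosamples-into-mongo finishes, a quick sanity check is to count the loaded documents and confirm the unique indexes exist. A minimal sketch, assuming the same local, unauthenticated mongod on the default port and the gold_metadata database name used above:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['gold_metadata']

# report document counts and index names for each collection the loader writes
for name in ('studies', 'biosamples', 'projects'):
    collection = db[name]
    count = collection.count_documents({})
    indexes = sorted(collection.index_information())
    print(f"{name}: {count} documents, indexes: {indexes}")

client.close()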
135 changes: 135 additions & 0 deletions sample_annotator/gold_to_mongo.py
@@ -0,0 +1,135 @@
import logging
from typing import List

import click
from pymongo import MongoClient, ASCENDING
from pymongo.errors import DuplicateKeyError

from clients.gold_client import GoldClient

# TODO: might need better API error handling
# TODO: be more consistent about bundling (projects within biosamples) vs. fetching biosamples separately from studies

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def create_unique_index(collection, field_name: str, index_name: str) -> None:
    """
    Creates a unique index on the specified field for a MongoDB collection.

    Args:
        collection: The MongoDB collection object.
        field_name: The field to index.
        index_name: The name of the index.
    """
    try:
        collection.create_index([(field_name, ASCENDING)], name=index_name, unique=True)
    except Exception as e:
        logging.error(f"Failed to create index '{index_name}': {e}")


def insert_document(collection, document: dict, key_name: str) -> None:
    """
    Inserts a document into a MongoDB collection, handling duplicate key errors.

    Args:
        collection: The MongoDB collection object.
        document: The document to insert.
        key_name: The key used for the unique index, for logging purposes.
    """
    try:
        collection.insert_one(document)
    except DuplicateKeyError:
        logging.warning(f"Duplicate key error for {key_name}")

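# Note: insert_one + DuplicateKeyError keeps the first-seen copy of each record.
# If re-runs should refresh existing documents instead, an upsert on the unique
# field would be a drop-in alternative (a sketch, not what this script does), e.g.:
#     collection.replace_one({"studyGoldId": document["studyGoldId"]}, document, upsert=True)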

def process_study_ids(file_path: str) -> List[str]:
    """
    Reads and processes study IDs from a file.

    Args:
        file_path: Path to the file containing study IDs.

    Returns:
        A sorted list of unique study IDs.
    """
    ids = set()
    with open(file_path) as file:
        for line in file:
            if line.startswith("Gs"):
                ids.add(line.strip())
    if not ids:
        raise ValueError(f"No valid IDs found in {file_path}")
    return sorted(ids)

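# The startswith("Gs") check mirrors the Makefile's grep 'Gs' filter: GOLD study
# identifiers begin with "Gs", so any other line in the input file is treated as
# noise and skipped.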

@click.command()
@click.option('--mongo-db-name', '-d', required=True,
              help='Name of the local, unauthenticated MongoDB database to use.')
@click.option('--study-ids-file', '-i',
              type=click.Path(exists=True, dir_okay=False, readable=True),
              required=True,
              help='Path to the input text file containing one GOLD study ID per line.')
@click.option('--authentication-file', '-a', default="config/gold-key.txt",
              help='Path to the authentication file. Contents should be user:pass.')
@click.option('--purge-mongodb', '-p', is_flag=True, default=False,
              help='Purge the destination MongoDB database before running.')
@click.option('--purge-diskcache', '-P', is_flag=True, default=False,
              help='Purge the input disk cache before running.')
def main(mongo_db_name: str, study_ids_file: str, authentication_file: str,
         purge_mongodb: bool, purge_diskcache: bool, **args):
"""
Fetch, process, and store biosamples, studies, and projects into MongoDB in real-time.
"""
# MongoDB setup
client = MongoClient('mongodb://localhost:27017/')
db = client[mongo_db_name]

if purge_mongodb:
logging.info("Purging MongoDB collections...")
db.drop_collection('biosamples')
db.drop_collection('studies')
db.drop_collection('projects')

# Setup collections and indexes
biosample_collection = db['biosamples']
study_collection = db['studies']
project_collection = db['projects']

create_unique_index(biosample_collection, "biosampleGoldId", "biosampleGoldId_index")
create_unique_index(study_collection, "studyGoldId", "studyGoldId_index")
create_unique_index(project_collection, "projectGoldId", "projectGoldId_index")
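    # With these unique indexes in place, re-inserting an already-loaded GOLD
    # record raises DuplicateKeyError, which insert_document() logs and skips,
    # so repeated runs do not create duplicate documents.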

    # Initialize GoldClient
    gc = GoldClient()

    if purge_diskcache:
        logging.info("Purging disk cache...")
        gc.clear_cache()

    gc.load_key(authentication_file)

    # Process study IDs
    study_ids = process_study_ids(study_ids_file)

    for study_id in study_ids:
        logging.info(f"Processing study {study_id}...")
        study = gc.fetch_study(study_id, **args)
        insert_document(study_collection, study, study_id)

        biosamples = gc.fetch_biosamples_by_study(study_id, **args)
        logging.info(f"Retrieved {len(biosamples)} biosamples for study {study_id}")

        for biosample in biosamples:
            # projects come bundled inside each biosample; store them separately
            for project in biosample.pop('projects', []):
                insert_document(project_collection, project, project.get('projectGoldId', 'Unknown'))
            insert_document(biosample_collection, biosample, biosample.get('biosampleGoldId', 'Unknown'))

    # Close the connection
    client.close()


if __name__ == "__main__":
    main()
