Commit
write to MongoDB
turbomam committed Jan 8, 2025
1 parent 42a9213 commit ed7d151
Showing 2 changed files with 165 additions and 12 deletions.
42 changes: 30 additions & 12 deletions make-gold-cache.Makefile
@@ -5,7 +5,7 @@

# in separate PRs!

-MAX_STUDY_ID=100
+MAX_STUDIES=300

downloads/goldData.xlsx:
	wget -O $@ "https://gold.jgi.doe.gov/download?mode=site_excel"
@@ -17,12 +17,19 @@ local/gold-studies.tsv: downloads/goldData.xlsx
		--output-file $@

local/gold-study-ids.txt: local/gold-studies.tsv
-	tail -n +2 $< | cut -f 1 > $@
+	# column 1 includes some noise (non-ID rows); GOLD study IDs start with 'Gs'
+	tail -n +2 $< | cut -f 1 | sort | grep 'Gs' > $@

local/gold-study-ids-subset.txt: local/gold-study-ids.txt
-	head -n $(MAX_STUDY_ID) $< > $@
+	head -n $(MAX_STUDIES) $< > $@

local/gold-cache.json: local/gold-study-ids-subset.txt
+# ~ 2 seconds/uncached study
+# ~ 30 studies/minute
+# ~ 2k studies/hour
+# ~ 50k studies/day
+# GOLD has ~ 63k studies
+# so < 2 days to cache all studies
	poetry run python sample_annotator/clients/gold_client.py \
		--verbose \
		fetch-studies \
@@ -32,12 +39,23 @@ local/gold-cache.json: local/gold-study-ids-subset.txt
		--authentication-file config/gold-key.txt \
		$<

-.PHONY: split-out-gold-biosamples
-split-out-gold-biosamples: local/gold-cache.json
-	poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
-		--input-file $< \
-		--study-output-file local/gold-studies-only.json \
-		--biosample-output-file local/gold-biosamples-only.json \
-		--project-output-file local/gold-projects-only.json \
-		--remove-contacts \
-		--remove-nulls
+#.PHONY: split-out-gold-biosamples
+#split-out-gold-biosamples: local/gold-cache.json
+#	poetry run python sample_annotator/file_utils/split_out_gold_biosamples.py \
+#		--input-file $< \
+#		--study-output-file local/gold-studies-only.json \
+#		--biosample-output-file local/gold-biosamples-only.json \
+#		--project-output-file local/gold-projects-only.json \
+#		--remove-contacts \
+#		--remove-nulls

+.PHONY: load-gold-biosamples-into-mongo
+load-gold-biosamples-into-mongo: local/gold-study-ids-subset.txt
+# note: both purge flags below are optional; drop them to keep existing data and cache
+	poetry run python sample_annotator/gold_to_mongo.py \
+		--authentication-file config/gold-key.txt \
+		--mongo-db-name gold_metadata \
+		--study-ids-file $< \
+		--purge-mongodb \
+		--purge-diskcache
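
After load-gold-biosamples-into-mongo finishes, a quick sanity check is to count the loaded documents and confirm the unique indexes exist. A minimal sketch, assuming the same local, unauthenticated mongod on the default port and the gold_metadata database name used above:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['gold_metadata']

# report document counts and index names for each collection the loader writes
for name in ('studies', 'biosamples', 'projects'):
    collection = db[name]
    count = collection.count_documents({})
    indexes = sorted(collection.index_information())
    print(f"{name}: {count} documents, indexes: {indexes}")

client.close()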
135 changes: 135 additions & 0 deletions sample_annotator/gold_to_mongo.py
@@ -0,0 +1,135 @@
import logging
from typing import List

import click
from pymongo import MongoClient, ASCENDING
from pymongo.errors import DuplicateKeyError

from clients.gold_client import GoldClient

# TODO: might need better API error handling
# TODO: be more consistent about bundling (projects within biosamples) vs. fetching biosamples separately from studies

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def create_unique_index(collection, field_name: str, index_name: str) -> None:
    """
    Creates a unique index on the specified field for a MongoDB collection.

    Args:
        collection: The MongoDB collection object.
        field_name: The field to index.
        index_name: The name of the index.
    """
    try:
        collection.create_index([(field_name, ASCENDING)], name=index_name, unique=True)
    except Exception as e:
        logging.error(f"Failed to create index '{index_name}': {e}")


def insert_document(collection, document: dict, key_name: str) -> None:
    """
    Inserts a document into a MongoDB collection, handling duplicate key errors.

    Args:
        collection: The MongoDB collection object.
        document: The document to insert.
        key_name: The key used for the unique index, for logging purposes.
    """
    try:
        collection.insert_one(document)
    except DuplicateKeyError:
        logging.warning(f"Duplicate key error for {key_name}")

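# Note: insert_one + DuplicateKeyError keeps the first-seen copy of each record.
# If re-runs should refresh existing documents instead, an upsert on the unique
# field would be a drop-in alternative (a sketch, not what this script does), e.g.:
#     collection.replace_one({"studyGoldId": document["studyGoldId"]}, document, upsert=True)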

def process_study_ids(file_path: str) -> List[str]:
    """
    Reads and processes study IDs from a file.

    Args:
        file_path: Path to the file containing study IDs.

    Returns:
        A sorted list of unique study IDs.
    """
    ids = set()
    with open(file_path) as file:
        for line in file:
            if line.startswith("Gs"):
                ids.add(line.strip())
    if not ids:
        raise ValueError(f"No valid IDs found in {file_path}")
    return sorted(ids)

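# The startswith("Gs") check mirrors the Makefile's grep 'Gs' filter: GOLD study
# identifiers begin with "Gs", so any other line in the input file is treated as
# noise and skipped.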

@click.command()
@click.option('--mongo-db-name', '-d', required=True,
              help='Name of the local, unauthenticated MongoDB database to use.')
@click.option('--study-ids-file', '-i',
              type=click.Path(exists=True, dir_okay=False, readable=True),
              required=True,
              help='Path to the input text file containing one GOLD study ID per line.')
@click.option('--authentication-file', '-a', default="config/gold-key.txt",
              help='Path to the authentication file. Contents should be user:pass.')
@click.option('--purge-mongodb', '-p', is_flag=True, default=False,
              help='Purge the destination MongoDB database before running.')
@click.option('--purge-diskcache', '-P', is_flag=True, default=False,
              help='Purge the input disk cache before running.')
def main(mongo_db_name: str, study_ids_file: str, authentication_file: str,
         purge_mongodb: bool, purge_diskcache: bool, **args):
"""
Fetch, process, and store biosamples, studies, and projects into MongoDB in real-time.
"""
# MongoDB setup
client = MongoClient('mongodb://localhost:27017/')
db = client[mongo_db_name]

if purge_mongodb:
logging.info("Purging MongoDB collections...")
db.drop_collection('biosamples')
db.drop_collection('studies')
db.drop_collection('projects')

# Setup collections and indexes
biosample_collection = db['biosamples']
study_collection = db['studies']
project_collection = db['projects']

create_unique_index(biosample_collection, "biosampleGoldId", "biosampleGoldId_index")
create_unique_index(study_collection, "studyGoldId", "studyGoldId_index")
create_unique_index(project_collection, "projectGoldId", "projectGoldId_index")
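    # With these unique indexes in place, re-inserting an already-loaded GOLD
    # record raises DuplicateKeyError, which insert_document() logs and skips,
    # so repeated runs do not create duplicate documents.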

    # Initialize GoldClient
    gc = GoldClient()

    if purge_diskcache:
        logging.info("Purging disk cache...")
        gc.clear_cache()

    gc.load_key(authentication_file)

    # Process study IDs
    study_ids = process_study_ids(study_ids_file)

    for study_id in study_ids:
        logging.info(f"Processing study {study_id}...")
        study = gc.fetch_study(study_id, **args)
        insert_document(study_collection, study, study_id)

        biosamples = gc.fetch_biosamples_by_study(study_id, **args)
        logging.info(f"Retrieved {len(biosamples)} biosamples for study {study_id}")

        for biosample in biosamples:
            # projects come bundled inside each biosample; store them separately
            for project in biosample.pop('projects', []):
                insert_document(project_collection, project, project.get('projectGoldId', 'Unknown'))
            insert_document(biosample_collection, biosample, biosample.get('biosampleGoldId', 'Unknown'))

    # Close the connection
    client.close()


if __name__ == "__main__":
    main()
