CDCgov · dthoward96 · Sep 25, 2024 · Sep 13, 2024 · Sep 13, 2024 · Sep 19, 2024
diff --git a/.github/ISSUE_TEMPLATE/support-my-organism.md b/.github/ISSUE_TEMPLATE/support-my-organism.md
@@ -0,0 +1,23 @@
+---
+name: Support my organism
+about: I want SeqSender to support my organism.
+title: "[ORGANISM : FEATURE / BUG]"
+labels: help wanted
+assignees: ''
+
+---
+
+**Have you attempted to upload your organism using SeqSender? If so, please list the organism affected. If not, please attempt using SeqSender with it first, as SeqSender currently supports a wide variety of organisms, databases, and submission options.**
+Be sure to check the Submission Wizard in the documentation for all the available customizations for submitting your samples to repositories.
+
+**Which databases are you uploading to? Are all of them affected? If not, list which ones are affected:**
+BIOSAMPLE/SRA/GENBANK/GISAID
+
+**Is the problem related to a metadata field, an additional file, available submission options, or the submission process itself?**
+If the field is only an attribute, it can be added to any database even if not validated by SeqSender, by simply adding the correct column name with the database prefix.
+
+**If possible, describe the solution you'd like to see, along with any additional details.**
+A clear and concise description of what you think needs to be changed or made available to resolve your issue.
+
+**Error Logs**
+Add any other context, logs, or screenshots related to the issue here.
diff --git a/.github/workflows/DH_GHCR_upload.yml b/.github/workflows/DH_GHCR_upload.yml
@@ -0,0 +1,52 @@
+name: Create and publish docker image to DockerHub and GitHub Container Repository
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  push_to_registry:
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+      attestations: write
+      id-token: write
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Log into GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            cdcgov/seqsender
+            ghcr.io/${{ github.repository }}
+
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: |
+            cdcgov/seqsender:${{ github.ref_name }}
+            cdcgov/seqsender:latest
+            ghcr.io/cdcgov/seqsender:${{ github.ref_name }}
+            ghcr.io/cdcgov/seqsender:latest
+          # Image labels are key=value pairs; publish the summary under the standard OCI description key
+          labels: "org.opencontainers.image.description=Genomic sequence pipeline to automate the process of generating necessary submission files and batch uploading them to public databases."
diff --git a/.github/workflows/GHCR_docker.yml b/.github/workflows/GHCR_docker.yml
diff --git a/.gitignore b/.gitignore
@@ -22,5 +22,4 @@ docker-compose-*.yaml
 **/.Rproj.user
 **/test_data/*
 **/gisaid_cli/*
-**/COV_TEST_DATA/*
-**/FLU_TEST_DATA/*
+**/*_TEST_DATA/*
diff --git a/README.Rmd b/README.Rmd
@@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES
 
 <p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>
 
-**Beta Version**: v1.2.1. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
+**Beta Version**: v1.2.3. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
 
 **General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm).  GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
 

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@
 
 </p>
 
-**Beta Version**: 1.2.1. This pipeline is currently in Beta testing, and
+**Beta Version**: 1.2.3. This pipeline is currently in Beta testing, and
 issues could appear during submission. Please use it at your own risk.
 Feedback and suggestions are welcome\!
 

diff --git a/biosample_sra_handler.py b/biosample_sra_handler.py
@@ -244,7 +244,15 @@ def process_biosample_sra_report(report_file: str, database: str, submission_dir
 	if "Action" not in report_dict["SubmissionStatus"]:
 		return submission_status, submission_id
 	try:
-		for action_dict in report_dict["SubmissionStatus"]["Action"]:
+		# If only a single sample, convert into list for proper formatting
+		if isinstance(report_dict["SubmissionStatus"]["Action"], list):
+			action_list = report_dict["SubmissionStatus"]["Action"]
+		elif isinstance(report_dict["SubmissionStatus"]["Action"], dict):
+			action_list = [report_dict["SubmissionStatus"]["Action"]]
+		else:
+			print(f"Error: Unable to correctly process BioSample report at: {report_file}", file=sys.stderr)
+			return submission_status, submission_id
+		for action_dict in action_list:
 			# Skip if incorrect database
 			if "@target_db" not in action_dict or action_dict["@target_db"].lower() != database.lower():
 				continue
@@ -271,6 +279,8 @@ def process_biosample_sra_report(report_file: str, database: str, submission_dir
 					sample_info.append({sample_name_col:sample_name, f"{column_prefix}_status":action_dict["@status"], f"{column_prefix}_accession":accession, f"{column_prefix}_message":""})
 	except:
 		pass
+	if submission_status == "PROCESSED" and not sample_info:
+		print(f"Error: Unable to process {database} report.xml to retrieve accessions at: {report_file}", file=sys.stderr)
 	if sample_info:
 		update_df = pd.DataFrame(sample_info)
 		upload_log.update_submission_status_csv(submission_dir=submission_dir, update_database=database, update_df=update_df)

diff --git a/config/gisaid/gisaid_FLU_schema.py b/config/gisaid/gisaid_FLU_schema.py
@@ -150,6 +150,56 @@
 			description="Additional information regarding patient (e.g. Patient infected while interacting with animal).",
 			title="Additional host information",
 		),
+		"gs-Sampling_Strategy": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Sampling strategy for sequence. (i.e. Baseline surveillance)",
+			title="Sampling strategy",
+		),
+		"gs-Sequencing_Strategy": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Sequencing purpose for sample. (i.e. DNA amplification)",
+			title="Sequencing strategy",
+		),
+		"gs-Sequencing_Technology": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Add the sequencer brand and model (e.g. Illumina MiSeq, Sanger, Nanopore MinION).",
+			title="Sequencing technology",
+		),
+		"gs-Assembly_Method": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Method used to assemble the sequence (e.g. assembly software name and version).",
+			title="Assembly Method",
+		),
+		"gs-Coverage": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Average genome coverage (e.g. 50x, 100x, 1,000x).",
+			title="Average coverage",
+		),
 		"gs-Submitting_Sample_Id": Column(
 			dtype="object",
 			checks=None,
@@ -198,7 +248,17 @@
 			coerce=False,
 			required=False,
 			description="",
-			title="adamantanes resistance",
+			title="adamantanes resistance genotype",
+		),
+		"gs-Adamantanes_Resistance_pheno": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="",
+			title="adamantanes resistance phenotype",
 		),
 		"gs-Oseltamivir_Resistance_geno": Column(
 			dtype="object",
@@ -208,7 +268,17 @@
 			coerce=False,
 			required=False,
 			description="",
-			title="oseltamivir resistance",
+			title="oseltamivir resistance genotype",
+		),
+		"gs-Oseltamivir_Resistance_pheno": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="",
+			title="oseltamivir resistance phenotype",
 		),
 		"gs-Zanamivir_Resistance_geno": Column(
 			dtype="object",
@@ -218,7 +288,17 @@
 			coerce=False,
 			required=False,
 			description="",
-			title="zanamivir resistance",
+			title="zanamivir resistance genotype",
+		),
+		"gs-Zanamivir_Resistance_pheno": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="",
+			title="zanamivir resistance phenotype",
 		),
 		"gs-Peramivir_Resistance_geno": Column(
 			dtype="object",
@@ -228,7 +308,17 @@
 			coerce=False,
 			required=False,
 			description="",
-			title="peramivir resistance",
+			title="peramivir resistance genotype",
+		),
+		"gs-Peramivir_Resistance_pheno": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="",
+			title="peramivir resistance phenotype",
 		),
 		"gs-Other_Resistance_geno": Column(
 			dtype="object",
@@ -238,7 +328,17 @@
 			coerce=False,
 			required=False,
 			description="",
-			title="other resistances",
+			title="other resistances genotype",
+		),
+		"gs-Other_Resistance_pheno": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="",
+			title="other resistances phenotype",
 		),
 		"gs-Host_Gender": Column(
 			dtype="object",

diff --git a/docs/app.json b/docs/app.json
diff --git a/gisaid_handler.py b/gisaid_handler.py
@@ -6,7 +6,7 @@
 
 import shutil
 import subprocess
-from typing import Dict, Any, List, Optional, Match
+from typing import Dict, Any, List, Optional, Match, Any
 import os
 import pandas as pd
 import file_handler
@@ -15,6 +15,8 @@
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+import warnings
+warnings.filterwarnings("ignore", 'This pattern has match groups')
 import re
 
 import upload_log
@@ -39,13 +41,15 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
 		gisaid_df["fn"] = "sequence.fsa"
 		first_cols = ["submitter", "fn", sample_name_column]
 	elif "FLU" in organism:
-		gisaid_df = gisaid_df.rename(columns = {"authors": "Authors", "collection_date": "Collection_Date"})
+		gisaid_df = gisaid_df.rename(columns = {"authors": "Authors"})
+		# Parse out dates into respective columns
+		gisaid_df[["Collection_Date", "Collection_Year", "Collection_Month"]] = gisaid_df["collection_date"].apply(process_flu_dates)
 		gisaid_df["Isolate_Id"] = ""
 		gisaid_df["Segment_Ids"] = ""
 		# Pivot FLU segment names from long form to wide form
 		gisaid_df["segment"] = "Seq_Id (" + gisaid_df["segment"].astype(str) + ")"
 		group_df = gisaid_df.pivot(index="Isolate_Name", columns="segment", values="sample_name").reset_index()
-		gisaid_df = gisaid_df.drop(columns=["sample_name", "segment"])
+		gisaid_df = gisaid_df.drop(columns=["sample_name", "segment", "collection_date"])
 		gisaid_df = gisaid_df.drop_duplicates(keep="first")
 		gisaid_df = gisaid_df.merge(group_df, on="Isolate_Name", how="inner", validate="1:1")
 		first_cols = ["Isolate_Id","Segment_Ids","Isolate_Name"]
@@ -58,6 +62,27 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
 	file_handler.create_fasta(database="GISAID", metadata=metadata, submission_dir=submission_dir)
 	shutil.copy(os.path.join(submission_dir, "sequence.fsa"), os.path.join(submission_dir, "orig_sequence.fsa"))
 
+# Flu collection dates require partial dates to use different columns
+def process_flu_dates(row: Any) -> pd.Series:
+	"""Split one FLU collection date into [Collection_Date, Collection_Year, Collection_Month] values.
+
+	A complete 'YYYY-MM-DD' date fills only the first slot; a partial 'YYYY' or 'YYYY-MM' date fills only the year/month
+	slots. A non-string value, or one with more than three '-' separated parts, prints an error and exits with status 1.
+	"""
+	value = row.strip() if isinstance(row, str) else None
+	sections = value.split("-") if value is not None else []
+	if len(sections) == 3:
+		# Complete date: year and month stay blank
+		fields = [value, "", ""]
+	elif len(sections) in (1, 2):
+		# Partial date: only the year (and month, when given) are reported
+		fields = ["", sections[0], sections[1] if len(sections) == 2 else ""]
+	else:
+		print(f"Error: Unable to process 'Collection_Date' column for FLU GISAID submission. The field should be in format 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD'. Value unable to process: {row if value is None else value}", file=sys.stderr)
+		sys.exit(1)
+	return pd.Series(fields)
+
+
 # Read output log from gisaid submission script
 def process_gisaid_log(log_file: str, submission_dir: str) -> pd.DataFrame:
 	file_handler.validate_file(file_type="GISAID log", file_path=log_file)

diff --git a/settings.py b/settings.py
@@ -12,7 +12,7 @@
 PROG_DIR: str = os.path.dirname(os.path.abspath(__file__))
 
 # SeqSender version
-VERSION: str = "1.2.1 (Beta)"
+VERSION: str = "1.2.3 (Beta)"
 
 # Organism options with unique submission options
 ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "OTHER"]