diff --git a/.gitignore b/.gitignore index 0d767cc..0bc7b0a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ +.DS_Store .vscode/ +testdata/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 8ae5031..2ef9d12 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,67 @@ The package can be installed with `pip`: pip install tp53 ``` +## Upload a VCF to the Seshat TP53 Annotation Server + +Upload a VCF to the [Seshat TP53 annotation server](http://vps338341.ovh.net/) using a headless browser. + +```bash +❯ python -m tp53.seshat.upload_vcf \ + --input "input.vcf" \ + --email "example@gmail.com" +``` +```console +INFO:tp53.seshat.upload_vcf:Uploading 0 %... +INFO:tp53.seshat.upload_vcf:Uploading 53%... +INFO:tp53.seshat.upload_vcf:Uploading 53%... +INFO:tp53.seshat.upload_vcf:Uploading 60%... +INFO:tp53.seshat.upload_vcf:Uploading 60%... +INFO:tp53.seshat.upload_vcf:Uploading 66%... +INFO:tp53.seshat.upload_vcf:Uploading 66%... +INFO:tp53.seshat.upload_vcf:Uploading 80%... +INFO:tp53.seshat.upload_vcf:Uploading 80%... +INFO:tp53.seshat.upload_vcf:Upload complete! +``` + +This tool is used to programmatically configure and upload batch variants in VCF format to the Seshat annotation server. +The tool works by building a headless Chrome browser instance and then interacting with the Seshat website directly through simulated key presses and mouse clicks. +Unfortunately, Seshat does not provide a native programmatic API and one could not be reverse engineered. +Seshat also utilizes custom JavaScript in their form processing, so a lightweight approach of simply interacting with the HTML form elements was also not possible. + +###### VCF Input Requirements + +Seshat will not let the user know why a VCF fails to annotate, but it has been observed that Seshat can fail to parse some of [VarDictJava](https://github.com/AstraZeneca-NGS/VarDictJava)'s structural variants (SVs) as valid variant records. 
+One solution that has worked in the past is to remove SVs. +The following command will exclude all variants with a non-empty SVTYPE INFO key: + +```bash +❯ bcftools view in.vcf --exclude 'SVTYPE!="."' > out.noSV.vcf +``` + +###### Automation + +There are no terms and conditions posted on the Seshat annotation server's website, and there is no server-side `robots.txt` rule set. +In lieu of usage terms, we strongly encourage all users of this script to respect the Seshat resource by adhering to the following best practices: + +- **Minimize Load**: Limit the rate of requests to the server +- **Minimize Connections**: Limit the number of concurrent requests + +If you need to batch process dozens, or hundreds, of VCF callsets, you may consider improving this underlying Python script to randomize the user agent and IP address of your headless browser session to avoid being labelled as a bot. + +###### Environment Setup + +This script relies on Google Chrome: + +```console +❯ brew install --cask google-chrome +``` + +Distributions of macOS may require you to authenticate the Chrome driver ([link](https://stackoverflow.com/a/60362134)). + ## Development and Testing See the [contributing guide](./CONTRIBUTING.md) for more information. + +## References + +- [Soussi, Thierry, et al. “Recommendations for Analyzing and Reporting TP53 Gene Variants in the High-Throughput Sequencing Era.” Human Mutation, vol. 35, no. 6, 2014, pp. 
766–778., doi:10.1002/humu.22561](https://doi.org/10.1002/humu.22561) diff --git a/poetry.lock b/poetry.lock index 609f47e..1a13af8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -284,6 +284,17 @@ files = [ {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] +[[package]] +name = "chromedriver-py" +version = "131.0.6778.108" +description = "chromedriver binaries for all platforms" +optional = false +python-versions = "*" +files = [ + {file = "chromedriver_py-131.0.6778.108-py3-none-any.whl", hash = "sha256:c2e9b8231f300c50e27019ec0a89dc3a5f79f635d213692bc0d61ecfe9630354"}, + {file = "chromedriver_py-131.0.6778.108.tar.gz", hash = "sha256:cffdcd722eac1a911e641594143195f50cfde78e15a8f94092c4bdb913a9b9b9"}, +] + [[package]] name = "colorama" version = "0.4.6" @@ -1486,4 +1497,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "8e668178ad27800ddd0de052c0cae38c8dca11ef898c84af8b79432b04fd6546" +content-hash = "cab3938b0b260f4500c5f1d3dd23bd7878d0bd9ce66c22afeffe978bef1d4160" diff --git a/pyproject.toml b/pyproject.toml index a368da3..b7eaecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.11" beautifulsoup4 = "~4.12" +chromedriver-py = "*" google-api-python-client = "~2.151" google-auth-httplib2 = "~0.2" google-auth-oauthlib = "~1.2.1" @@ -123,7 +124,7 @@ exclude = [ ] [[tool.mypy.overrides]] -module = "defopt" +module = "chromedriver_py" ignore_missing_imports = true [[tool.mypy.overrides]] diff --git a/tests/seshat/test_upload.py b/tests/seshat/test_upload.py index 9920c9b..2c31a02 100644 --- a/tests/seshat/test_upload.py +++ b/tests/seshat/test_upload.py @@ -1,4 +1,4 @@ -from tp53.seshat import HumanGenomeAssembly +from tp53.seshat.upload_vcf import HumanGenomeAssembly def test_human_genome_assembly() -> None: diff --git a/tp53/seshat/__init__.py b/tp53/seshat/__init__.py 
index 287d908..432e6b7 100644 --- a/tp53/seshat/__init__.py +++ b/tp53/seshat/__init__.py @@ -1,4 +1 @@ from tp53.seshat._exceptions import SeshatError as SeshatError -from tp53.seshat._gmail_find import find_in_gmail as find_in_gmail -from tp53.seshat._upload import HumanGenomeAssembly as HumanGenomeAssembly -from tp53.seshat._upload import upload_vcf as upload_vcf diff --git a/tp53/seshat/find_in_gmail/__init__.py b/tp53/seshat/find_in_gmail/__init__.py new file mode 100644 index 0000000..ea6d0bf --- /dev/null +++ b/tp53/seshat/find_in_gmail/__init__.py @@ -0,0 +1 @@ +from ._find_in_gmail import find_in_gmail as find_in_gmail diff --git a/tp53/seshat/find_in_gmail/__main__.py b/tp53/seshat/find_in_gmail/__main__.py new file mode 100644 index 0000000..c32dc2b --- /dev/null +++ b/tp53/seshat/find_in_gmail/__main__.py @@ -0,0 +1,2 @@ +if __name__ == "__main__": + ... diff --git a/tp53/seshat/_gmail_find.py b/tp53/seshat/find_in_gmail/_find_in_gmail.py similarity index 99% rename from tp53/seshat/_gmail_find.py rename to tp53/seshat/find_in_gmail/_find_in_gmail.py index 512b5f3..68e749b 100644 --- a/tp53/seshat/_gmail_find.py +++ b/tp53/seshat/find_in_gmail/_find_in_gmail.py @@ -20,7 +20,7 @@ from google_auth_oauthlib.flow import InstalledAppFlow from googleapiclient.discovery import build as build_google_client -from ._exceptions import SeshatError +from .._exceptions import SeshatError logger: Logger = getLogger("tp53.seshat") diff --git a/tp53/seshat/upload_vcf/__init__.py b/tp53/seshat/upload_vcf/__init__.py new file mode 100644 index 0000000..b03bfd4 --- /dev/null +++ b/tp53/seshat/upload_vcf/__init__.py @@ -0,0 +1,2 @@ +from ._upload_vcf import HumanGenomeAssembly as HumanGenomeAssembly +from ._upload_vcf import upload_vcf as upload_vcf diff --git a/tp53/seshat/upload_vcf/__main__.py b/tp53/seshat/upload_vcf/__main__.py new file mode 100644 index 0000000..d1b009f --- /dev/null +++ b/tp53/seshat/upload_vcf/__main__.py @@ -0,0 +1,118 @@ +""" +Upload a VCF to the 
Seshat TP53 annotation server using a headless browser. + +This tool is used to programmatically configure and upload batch variants in VCF +format to the Seshat annotation server. The tool works by building a headless +Chrome browser instance and then interacting with the Seshat website directly +through simulated key presses and mouse clicks. Unfortunately, Seshat does not +provide a native programmatic API and one could not be reverse engineered. +Seshat also utilizes custom JavaScript in their form processing, so a +lightweight approach of simply interacting with the HTML form elements was +also not possible. + +#### VCF Input Requirements + +Seshat will not let the user know why a VCF fails to annotate, but it has +been observed that Seshat can fail to parse some of VarDictJava's structural +variants (SVs) as valid variant records. One solution that has worked in the +past is to remove SVs. The following command will exclude all variants with a +non-empty SVTYPE INFO key: + + bcftools view in.vcf --exclude 'SVTYPE!="."' > out.noSV.vcf + +#### Automation + +There are no terms and conditions posted on the Seshat annotation server's +website, and there is no server-side `robots.txt` rule set. In lieu of usage +terms, we strongly encourage all users of this script to respect the Seshat +resource by adhering to the following best practices: + + - Minimize Load: Limit the rate of requests to the server + - Minimize Connections: Limit the number of concurrent requests + +If you need to batch process dozens, or hundreds, of VCF callsets, you may +consider improving this underlying Python script to randomize the user agent and +IP address of your headless browser session to avoid being labelled as a +bot. + +#### Environment Setup + +This script relies on Chrome: + + brew install --cask google-chrome + +Distributions of macOS require you to authenticate the Chrome driver: + + - https://stackoverflow.com/a/60362134 + +#### References + + 1. Soussi, Thierry, et al. 
“Recommendations for Analyzing and Reporting TP53 + Gene Variants in the High-Throughput Sequencing Era.” Human Mutation, + vol. 35, no. 6, 2014, pp. 766–778., doi:10.1002/humu.22561. + +─────── +""" + +import argparse +import logging +import sys +from pathlib import Path + +from ._upload_vcf import DEFAULT_REMOTE_URL +from ._upload_vcf import HumanGenomeAssembly +from ._upload_vcf import upload_vcf + +if __name__ == "__main__": + formatter = argparse.RawTextHelpFormatter + + cli_args = sys.argv[1:] + + parser = argparse.ArgumentParser( + description=__doc__, + add_help=True, + formatter_class=formatter, + epilog=r"Copyright © Clint Valentine 2024", + ) + + _ = parser.add_argument( + "--input", + required=True, + type=Path, + help="The path to the VCF to upload.", + ) + _ = parser.add_argument( + "--email", + required=True, + type=str, + help="The email address to receive annotated variants at.", + ) + _ = parser.add_argument( + "--assembly", + type=HumanGenomeAssembly, + default=HumanGenomeAssembly.hg38, + help="The human genome assembly of the VCF.\n(default: hg38)", + ) + _ = parser.add_argument( + "--url", + type=str, + default=DEFAULT_REMOTE_URL, + help="The Seshat TP53 web server URL.\n(default: http://vps338341.ovh.net/batch_analysis)", + ) + _ = parser.add_argument( + "--wait_for", + type=int, + default=5, + help="Seconds to wait for upload to occur before failure.\n(default: 5)", + ) + args = parser.parse_args(cli_args) + + logging.basicConfig(datefmt="[%X]", level=logging.INFO) + + upload_vcf( + vcf=args.input, + email=args.email, + assembly=args.assembly, + url=args.url, + wait_for=args.wait_for, + ) diff --git a/tp53/seshat/_upload.py b/tp53/seshat/upload_vcf/_upload_vcf.py similarity index 81% rename from tp53/seshat/_upload.py rename to tp53/seshat/upload_vcf/_upload_vcf.py index c9ec62f..1678b32 100644 --- a/tp53/seshat/_upload.py +++ b/tp53/seshat/upload_vcf/_upload_vcf.py @@ -1,18 +1,20 @@ +import logging +import time from datetime import datetime 
from datetime import timedelta from enum import StrEnum from enum import auto from logging import Logger -from logging import getLogger from pathlib import Path +from chromedriver_py import binary_path from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver -from ._exceptions import SeshatError +from .._exceptions import SeshatError -logger: Logger = getLogger("tp53.seshat") +logger: Logger = logging.getLogger("tp53.seshat.upload_vcf") DEFAULT_REMOTE_URL: str = "http://vps338341.ovh.net/batch_analysis" """The default remote Seshat batch analysis URL.""" @@ -34,7 +36,7 @@ class HumanGenomeAssembly(StrEnum): """The human genome assembly GRCh37 (hg19).""" -def seshat_upload_status(driver: RemoteWebDriver) -> str: +def upload_status(driver: RemoteWebDriver) -> str: """Query the file uploading status and return its text representation.""" modal = driver.find_element(By.XPATH, '//*[@id="uploading-status-text"]') inner = modal.get_attribute("innerText") @@ -55,16 +57,18 @@ def upload_vcf( Args: vcf: The path to the VCF to upload. - email: The email address to receive Seshat TP53 variant annotations. + email: The email address to receive annotated variants at. assembly: The human genome assembly of the VCF. url: The Seshat TP53 web server URL. - wait_for: The total amount of time in seconds to wait for the upload occur before failure. + wait_for: Seconds to wait for upload to occur before failure. 
""" vcf = str(Path(vcf).expanduser().absolute()) + service = webdriver.ChromeService(executable_path=binary_path) options = webdriver.ChromeOptions() options.add_argument("headless") - driver = webdriver.Chrome(options=options) + + driver = webdriver.Chrome(service=service, options=options) driver.get(url) driver.find_element(By.XPATH, f'//select[@id="reference"]/option[@value="{assembly}"]').click() @@ -75,8 +79,9 @@ def upload_vcf( status: str = "" while (SUCCESS not in status) and datetime.now() < upload_start + timedelta(seconds=wait_for): - status = seshat_upload_status(driver) + status = upload_status(driver) logger.info(status) + time.sleep(0.1) driver.quit()