From ba4a4e0633cfc65b8a7b4f786543fbf13b8e7172 Mon Sep 17 00:00:00 2001 From: Christian Perez Llamas <932644+chris-zen@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:52:27 +0200 Subject: [PATCH] Upgrading versions and adding Continuous Integration and Delivery --- .dockerignore | 4 + .github/workflows/build.yaml | 157 ++++++++++++++++++ .gitignore | 7 + .hadolint.yaml | 2 + .python-version | 1 + Dockerfile | 10 ++ MANIFEST.in | 4 - Makefile | 121 ++++++++++++++ README.md | 302 +++++++++++++++++++++++++++++++++++ oncodriveclustl/__init__.py | 3 +- oncodriveclustl/main.py | 36 ----- pyproject.toml | 46 ++++++ requirements.txt | 13 -- setup.py | 45 ------ 14 files changed, 651 insertions(+), 100 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/build.yaml create mode 100644 .hadolint.yaml create mode 100644 .python-version create mode 100644 Dockerfile delete mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.md create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..aea8d0b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +.venv/ +dist/ +example/ +oncodriveclustl.egg-info/ diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..102a0c2 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,157 @@ +name: Build and Publish + +on: + push: + tags: + - "**" + branches: + - "**" + +permissions: + contents: read + +env: + TERM: xterm + PYTHON_VERSION: "3.12" + +jobs: + packages-build: + name: Build packages + runs-on: ubuntu-latest + env: + RUFF_FORMAT: github + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: eifinger/setup-rye@v4 + + - name: Check format + run: | + make check-format || true + BOLDRED=$(tput bold && tput 
setaf 1) + RESET=$(tput sgr0) + echo "${BOLDRED}==> We won't fail on formatting errors for the time being, but we will in the future.${RESET}" + + - name: Check lint + run: | + make check-lint || true + BOLDRED=$(tput bold && tput setaf 1) + RESET=$(tput sgr0) + echo "${BOLDRED}==> We won't fail on lint errors for the time being, but we will in the future.${RESET}" + + - name: Build packages + run: make build-dist + + - name: Upload packages + uses: actions/upload-artifact@v4 + with: + name: python-packages + path: dist + + docker-build: + name: Build Docker image + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: eifinger/setup-rye@v4 + + - name: Check Dockerfile + run: make check-docker + + - name: Build Docker image + run: make build-image + + # TODO: Enable this when we figure out how to run it without having to download several Gigabytes of data. + # - name: Test Docker image + # run: make run-example + + check-version: + name: Check version + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + needs: + - packages-build + - docker-build + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: eifinger/setup-rye@v4 + + - name: Check version matching the tag + run: make check-version + + packages-publish: + name: Publish packages + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + needs: + - check-version + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: eifinger/setup-rye@v4 + + - name: Download packages + uses: actions/download-artifact@v4 + with: + name: python-packages + path: dist + + - name: Publish to PyPI + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + if: ${{ env.PYPI_TOKEN != '' }} 
+ run: make publish-dist + + docker-push: + name: Push Docker image + if: startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + env: + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + needs: + - check-version + + steps: + - if: ${{ env.DOCKER_USERNAME != '' }} + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: eifinger/setup-rye@v4 + + - name: Login to DockerHub + if: ${{ env.DOCKER_USERNAME != '' }} + run: echo ${{ secrets.DOCKER_PASSWORD }} | docker login -u ${{ secrets.DOCKER_USERNAME }} --password-stdin + + - name: Push Docker image + if: ${{ env.DOCKER_USERNAME != '' }} + run: make build-image push-image diff --git a/.gitignore b/.gitignore index 572b27a..de6a073 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ dataset_randomizator.py Distance_mutations.ipynb oncodriveclustl/count_mutations.py oncodriveclustl/mutations_intogen_local.txt +example/output # Singularity images *.simg @@ -35,7 +36,13 @@ oncodriveclustl/mutations_intogen_local.txt *~ # Python bytecode +__pycache__ *.pyc *.egg-info/ dist/ +# rye files +.venv/ +requirements-dev.lock +requirements.lock + diff --git a/.hadolint.yaml b/.hadolint.yaml new file mode 100644 index 0000000..bc1caf0 --- /dev/null +++ b/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3003 \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..455808f --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12.4 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6e638a7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.12-slim + +# hadolint ignore=DL3042 +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,target=/project,rw \ + cd /project && pip install . 
+ +RUN oncodriveclustl --help + +ENTRYPOINT [ "/usr/local/bin/oncodriveclustl" ] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 523dbf3..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -# Files to be included in the distribution -include MANIFEST.in -include requirements.txt -include data/* \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4b29b29 --- /dev/null +++ b/Makefile @@ -0,0 +1,121 @@ +ROOT_DIR := $(shell echo $(dir $(lastword $(MAKEFILE_LIST))) | sed 's|/*$$||') + +SHELL := /bin/bash + +VERSION = $(shell rye version) + +GIT_TAG_OR_SHA = $(shell git describe --tags --exact-match 2>/dev/null || git rev-parse --short HEAD) + +IMAGE_TAG = $(VERSION) +IMAGE := bbglab/oncodriveclustl:$(IMAGE_TAG) + +BOLDRED := $(shell tput bold && tput setaf 1) +BOLDGREEN := $(shell tput bold && tput setaf 2) +BOLDYELLOW := $(shell tput bold && tput setaf 3) +BOLDBLUE := $(shell tput bold && tput setaf 4) +LIGHTBLUE := $(shell tput setaf 6) +WHITE := $(shell tput sgr0 && tput setaf 7) +RESET := $(shell tput sgr0) + + +.PHONY: help +help: + @echo "$(BOLDYELLOW)Available targets:$(RESET)" + @echo + @echo "$(BOLDGREEN) checks $(WHITE)-> Run all the checks (format and lint)" + @echo "$(BOLDGREEN) check-format $(WHITE)-> Check for formatting errors" + @echo "$(BOLDGREEN) check-lint $(WHITE)-> Check for lint errors" + @echo "$(BOLDGREEN) check-docker $(WHITE)-> Check the Dockerfile" + @echo "$(BOLDGREEN) format $(WHITE)-> Format source code" + @echo "$(BOLDGREEN) build-dist $(WHITE)-> Build source and wheel distribution files" + @echo "$(BOLDGREEN) build-image $(WHITE)-> Build the Docker image" + @echo "$(BOLDGREEN) push-image $(WHITE)-> Push the Docker image into DockerHub" + @echo "$(BOLDGREEN) run-example $(WHITE)-> Run the included example using the Docker image" + @echo "$(BOLDGREEN) clean $(WHITE)-> Clean the working directory (build files, virtual environments, caches)" + @echo "$(RESET)" + +.PHONY: 
rye-installed +rye-installed: + @if ! which rye > /dev/null; then \ + echo "$(BOLDRED)This project build is managed by $(BOLDYELLOW)rye$(BOLDRED), which is not installed.$(RESET)"; \ + echo "$(LIGHTBLUE)Please follow these instructions to install it:$(RESET)"; \ + echo "$(LIGHTBLUE)--> $(BOLDBLUE)https://rye.astral.sh/guide/installation/$(RESET)"; \ + exit 1; \ + fi + +.PHONY: checks +checks: check-format check-lint check-docker + +.PHONY: check-format +check-format: rye-installed + @echo "$(BOLDGREEN)Checking code format ...$(RESET)" + rye fmt --check + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: check-lint +check-lint: rye-installed + @echo "$(BOLDGREEN)Checking lint ...$(RESET)" + rye lint + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: check-docker +check-docker: + @echo "$(BOLDGREEN)Checking Dockerfile ...$(RESET)" + docker run --rm -i \ + -v "$$(pwd)":/project \ + hadolint/hadolint hadolint \ + --config /project/.hadolint.yaml \ + /project/Dockerfile + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: check-version +check-version: rye-installed + @echo "$(BOLDGREEN)Checking that the version matches the tag ...$(RESET)" + @if [ "$(VERSION)" != "$(GIT_TAG_OR_SHA)" ]; then \ + echo "$(BOLDRED)==> Version $(BOLDYELLOW)$(VERSION)$(BOLDRED) doesn't match the git tag $(BOLDYELLOW)$(GIT_TAG_OR_SHA)$(BOLDRED) !!!$(RESET)"; \ + echo "$(BOLDRED)==> Please update the $(BOLDYELLOW)__version__$(BOLDRED) in $(BOLDYELLOW)oncodriveclustl/__init__.py$(BOLDRED) and re-create the tag.$(RESET)"; \ + exit 1; \ + fi + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: format +format: rye-installed + @echo "$(BOLDGREEN)Formatting code ...$(RESET)" + rye fmt + +.PHONY: build-dist +build-dist: rye-installed + @echo "$(BOLDGREEN)Building packages ...$(RESET)" + rye build + +.PHONY: publish-dist +publish-dist: rye-installed + @echo "$(BOLDGREEN)Publishing OncodriveCLUSTL $(BOLDYELLOW)$(VERSION)$(BOLDGREEN) to PyPI ...$(RESET)" + @if [[ -z "$(PYPI_TOKEN)" ]]; then echo 
"$(BOLDRED)==> Missing PyPI token !!!$(RESET)"; exit 1; fi + rye publish --token $(PYPI_TOKEN) + +.PHONY: build-image +build-image: rye-installed + @echo "$(BOLDGREEN)Building Docker image $(BOLDYELLOW)$(IMAGE)$(BOLDGREEN) ...$(RESET)" + docker build --progress=plain -t $(IMAGE) . + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: push-image +push-image: rye-installed + @echo "$(BOLDGREEN)Pushing the Docker image into the DockerHub ...$(RESET)" + docker push $(IMAGE) + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: run-example +run-example: rye-installed + @echo "$(BOLDGREEN)Running example ...$(RESET)" + rye run oncodriveclustl \ + -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output \ + -sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --clustplot -e KRAS + @echo "$(BOLDGREEN)==> Success!$(RESET)" + +.PHONY: clean +clean: + @echo "$(BOLDGREEN)Cleaning the repository ...$(RESET)" + rm -rf ./oncodriveclustl.egg-info ./dist ./.ruff_cache ./.venv + find . -name "__pycache__" -type d -exec rm -r {} + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9f78e91 --- /dev/null +++ b/README.md @@ -0,0 +1,302 @@ +# OncodriveCLUSTL + +OncodriveCLUSTL is a sequence-based clustering method to identify significant +clustering signals in nucleotide sequence. + +One of the main goals of cancer research is the identification of the genomic +elements that drive tumorigenesis. OncodriveCLUSTL is a new nucleotide +sequence-based clustering algorithm to detect significant clustering signals +across genomic regions. OncodriveCLUSTL is based on a local background model +derived from the nucleotide context mutational probabilities of the cohort under +study. Our method is able to identify well-known cancer drivers in coding +regions and it can be applied to non-coding regions and non-human data. + +## License + +OncodriveCLUSTL is available to the general public subject to certain conditions +described in its [LICENSE](LICENSE). 
+ + +## Installation + +OncodriveCLUSTL depends on Python 3.10 or above. We recommend installing it using +the [Anaconda Python distribution](https://www.anaconda.com/download/): + +```bash +conda install -c bbglab oncodriveclustl +``` + +OncodriveCLUSTL can also be installed using pip: + +```bash +pip install oncodriveclustl +``` + +You can obtain the latest code from the repository and install it for development with pip: + +```bash +git clone git@bitbucket.org:bbglab/oncodriveclustl.git +cd oncodriveclustl +python -m venv .venv +.venv/bin/pip install -e . +source .venv/bin/activate +oncodriveclustl --help +``` + +> [!NOTE] +> The first time that you run OncodriveCLUSTL with a given reference genome, it +> will download it from our servers. By default the downloaded datasets go to +> `~/.bgdata`. If you want to move these datasets to another folder you have to +> define the system environment variable `BGDATA_LOCAL` with an export command. + +> [!NOTE] +> If you install a modern build tool like [rye](https://rye.astral.sh), you can +> simply do this: +> ```bash +> git clone git@bitbucket.org:bbglab/oncodriveclustl.git +> cd oncodriveclustl +> rye sync +> rye run oncodriveclustl --help +> ``` + +## Input data + +OncodriveCLUSTL only requires two main inputs, the mutations file and the +annotations file. + +### Mutations file + +TSV file containing SNVs (substitutions) mapped to a reference genome (e.g., +human hg19 or mouse c3h). If other mutation types are present (insertions, +deletions, double base substitutions, etc), they will be filtered out during the +analysis. This file must contain, at least, the following 5 columns with header: + +1. **CHROMOSOME**: 1, 2,..., X, Y +2. **POSITION**: Integer indicating the position of the mutation +3. **REF**: Reference nucleotide +4. **ALT**: Alternate nucleotide +5. **SAMPLE**: Identifier of the sample + +Additional columns are: + +6. **CANCER_TYPE**: Type of tumor. 
When specified, OncodriveCLUSTL will + calculate one mutational profile for each cancer type and mutations will be + randomized accordingly. +7. **SIGNATURE**: User-defined group to compute k-mer nucleotide mutational + probabilities. When specified, OncodriveCLUSTL will calculate one mutational + profile for each group and will randomize each mutation accordingly. + +> [!NOTE] +> OncodriveCLUSTL assumes all SNVs are mapped to the positive strand. + +> [!WARNING] +> When using the `--signature-group` option, please check that the number of +> mutations per group is sufficient for an accurate signatures calculation. + +### Annotations file + +TSV file containing the coordinates of genomic elements (GEs). This file must +contain, at least, the following 5 columns with header: + +1. **CHROMOSOME**: 1, 2,..., X, Y +2. **START**: Starting position of the genomic region +3. **END**: Final position of the genomic region +4. **ELEMENT**: Identifier of the GE +5. **SYMBOL**: Symbol of the GE. OncodriveCLUSTL will analyze GEs as **SYMBOL** + **ELEMENT**. + +Additional columns are: + +6. **STRAND**: Strand of the GE coordinates ("+" or "-"). + +> [!WARNING] +> Coordinates of a given GE cannot overlap. + +You can check the input formats in the files provided in the example. + +If you have a VCF file or directory of VCF files containing somatic mutations, +you can run our VCF parser to obtain a tabular file compatible with +OncodriveCLUSTL input format: + +```bash +parse_vcf -i [INPUT_DIRECTORY] -o [OUTPUT_FILE] +``` + +Please, check [parsers/vcf.py](oncodriveclustl/parsers/vcf.py) module for more +details. 
+ +If you would like to run OncodriveCLUSTL using a pre-calculated signature or +mutational profile, you need to provide a dictionary containing the reference +k-mer to alternate mutational probabilities in JSON format: + +```json +{ + "my_dataset": { + "GCA>G": 0.02424271083094251, + "AGC>A": 0.023005887103025254, + "ACG>T": 0.037613802858829135, + "CGA>C": 0.10691031051670515, + "GAC>G": 0.017846071811001615, + "TTC>A": 0.024003748061871697, + "CTT>G": 0.024149863672267024, + "GGA>T": 0.011178562948734577, + "AGG>C": 0.010654720767868876, + "GGG>C": 0.012031686292218055, + "CAA>T": 0.014478959792844522, + "TGA>A": 0.01255651801972085, + "GGA>A": 0.011178562948734577, + "CGA>A": 0.03563677017223505, + "TCC>T": 0.011158347971568658, + "GCC>A": 0.010952316565906438, + // ... + } +} +``` + +OncodriveCLUSTL requires non-collapsed k-mer probabilities (192 for +tri-nucleotides, 3072 for penta-nucleotides). + +## Output data + +OncodriveCLUSTL generates three output files: + +### Elements results file ('elements_results.txt') + +TSV file containing results of the analyzed elements: + +1. **SYMBOL**: GE symbol; **ENSID**: GE ID; **CGC**: True if GE in the COSMIC Cancer Gene Census (CGC) list (Sondka et al., 2018) +2. **CHROMOSOME**: 1, 2,..., X, Y +3. **STRAND**: Strand of the GE ("+" or "-") +4. **LENGTH**: length (bp) of the GE +5. **TOTAL_MUT**: total substitutions observed in the GE +6. **CLUSTERED_MUT**: number of substitutions in a cluster +7. **CLUSTERS**: number of clusters +8. **SIM_CLUSTERS**: number of simulated clusters +9. **SCORE**: GE score +10. **P_EMPIRICAL**: empirical p-value of the GE +11. **Q_EMPIRICAL**: empirical q-value of the GE +12. **P_ANALYTICAL**: analytical p-value of the GE +13. **Q_ANALYTICAL**: analytical q-value of the GE +14. **P_TOPCLUSTER**: analytical p-value of the cluster with highest cluster score +15. **Q_TOPCLUSTER**: analytical q-value of the cluster with highest cluster score + +### Clusters results file ('clusters_results.tsv') 
+ +TSV file containing results of the clusters observed in the analyzed elements: + +1. **RANK**: Position of the GE in the list of elements results (`elements_results.txt`) +2. **SYMBOL**: GE symbol +3. **ENSID**: GE ID +4. **CGC**: True if GE in the CGC list +5. **CHROMOSOME**: 1, 2,..., X, Y +6. **STRAND**: Strand of the GE ("+" or "-") +7. **COORDINATES**: genomic coordinates of the cluster. It can be 'coord1,coord2' + for clusters inside a single region or 'coord1,coord2;coord3,coord4' for + those spanning regions (--concatenate flag) +8. **MAX_COORD**: genomic position with the highest smoothing score inside the cluster +9. **WIDTH**: cluster's width (bp) +10. **N_MUT**: number of substitutions in the cluster +11. **N_SAMPLES**: number of samples with a mutation in the cluster +12. **FRA_UNIQ_SAMPLES**: proportion of unique samples mutated in the cluster out of the total of mutations in the cluster +13. **SCORE**: cluster score +14. **P**: analytical p-value of the cluster + +### Log file ('results.log') + +TXT file containing OncodriveCLUSTL's run information. + +## Usage + +OncodriveCLUSTL is meant to be used through the command line. + +``` +Usage: oncodriveclustl [OPTIONS] + +Options: + -i, --input-file PATH File containing somatic mutations + [required] + -r, --regions-file PATH File with the genomic regions to analyze + [required] + -o, --output-directory TEXT Output directory to be created [required] + -sig, --input-signature PATH File containing input context based + mutational probabilities (signature) + -ef, --elements-file PATH File with the symbols of the elements to + analyze + -e, --elements TEXT Symbol of the element(s) to analyze + -g, --genome [hg38|hg19|mm10|c3h|car|cast|f344] + Genome to use + -emut, --element-mutations INTEGER + Cutoff of element mutations. Default is 2 + -cmut, --cluster-mutations INTEGER + Cutoff of cluster mutations. Default is 2 + -sw, --smooth-window INTEGER RANGE + Smoothing window. 
Default is 11 [3<=x<=101] + -cw, --cluster-window INTEGER RANGE + Cluster window. Default is 11 [3<=x<=101] + -kmer, --kmer [3|5] K-mer nucleotide context + -n, --n-simulations INTEGER number of simulations. Default is 1000 + -sim, --simulation-mode [mutation_centered|region_restricted] + Simulation mode + -simw, --simulation-window INTEGER RANGE + Simulation window. Default is 31 + [19<=x<=101] + -sigcalc, --signature-calculation [frequencies|region_normalized] + Signature calculation: mutation frequencies + (default) or k-mer mutation counts + normalized by k-mer region counts + -siggroup, --signature-group [SIGNATURE|SAMPLE|CANCER_TYPE] + Header of the column to group signatures + calculation + -c, --cores INTEGER RANGE Number of cores to use in the computation. + By default it will use all the available + cores. [1<=x<=10] + --seed INTEGER Seed to use in the simulations + --log-level [debug|info|warning|error|critical] + Verbosity of the logger + --concatenate Calculate clustering on concatenated genomic + regions (e.g., exons in coding sequences) + --clustplot Generate a needle plot with clusters for an + element + --qqplot Generate a quantile-quantile (Q-Q) plot for + a dataset + --gzip Gzip compress files + -h, --help Show this message and exit. +``` + +> [!NOTE] +> When using simulation mode 'mutation_centered', simulation windows can be +> simulated outside the GE. + +> [!NOTE] +> When using `--signature-calculation region_normalized`, k-mer mutation counts +> will be normalized by k-mer nucleotide counts in the genomic regions +> provided as input (`--regions-file`). + +## Run the example + +If you run OncodriveCLUSTL from the [source code], you can run an example of +TCGA pancreatic adenocarcinomas (Ellrott et al., 2018) for coding regions +(Mularoni et al., 2016) using 1000 simulations. First you need to download the +example folder. 
Then you run OncodriveCLUSTL with default mode and parameters +as: + +[source code]: https://github.com/bbglab/oncodriveclustl + +```bash +oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output +``` + +The results will be saved in a folder named `output`. + +You can compute a more sophisticated analysis using non-default parameters and +generate a quantile-quantile plot by typing: + +```bash +oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output -sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --qqplot +``` + +If you want to run a specific GE and generate a plot of its observed clusters, you +can type: + +```bash +oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output -sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --clustplot -e KRAS +``` diff --git a/oncodriveclustl/__init__.py b/oncodriveclustl/__init__.py index 4a8a8d1..4a2bfa8 100644 --- a/oncodriveclustl/__init__.py +++ b/oncodriveclustl/__init__.py @@ -1,2 +1 @@ -VERSION = (1, 1, 4) -__version__ = '.'.join([str(i) for i in VERSION]) \ No newline at end of file +__version__ = "1.2.0" \ No newline at end of file diff --git a/oncodriveclustl/main.py b/oncodriveclustl/main.py index c3cd8a7..0d236c7 100644 --- a/oncodriveclustl/main.py +++ b/oncodriveclustl/main.py @@ -104,42 +104,6 @@ def main(input_file, qqplot, gzip ): - """ - OncodriveCLUSTL is a sequence based clustering method to identify cancer drivers across the genome - - Args: - input_file (str): path to mutations file - regions_file (str): path to input genomic coordinates file - output_directory(str): path to output directory. Output files will be generated in it. - input_signature (str): path to file containing input context based mutational probabilities. - By default (when no input signatures), OncodriveCLUSTL will calculate them from the mutations input file. 
- elements_file (str): path to file containing one element per row (optional) to analyzed the listed elements. - By default, OncodriveCLUSTL analyzes all genomic elements contained in `regions_file`. - elements (str): genomic element symbol (optional). The analysis will be performed only on the specified GEs. - genome (str): genome to use: 'hg38', 'hg19', 'mm10', 'c3h', 'car', 'cast' and 'f344' - element_mutations (int): minimum number of mutations per genomic element to undertake analysis - cluster_mutations (int): minimum number of mutations to define a cluster - smooth_window (int): Tukey kernel smoothing window length - cluster_window (int): clustering window length - kmer (int): context nucleotides to calculate the mutational probabilities (trinucleotides or pentanucleotides) - n_simulations (int): number of simulations - simulation_mode (str): simulation mode - simulation_window (int): window length to simulate mutations - signature_calculation (str): signature calculation, mutation frequencies (default) or mutation counts - normalized by k-mer region counts - signature_group (str): header of the column to group signatures. 
One signature will be computed for each group - cores (int): number of CPUs to use - seed (int): seed - log_level (str): verbosity of the logger - concatenate (bool): flag to calculate clustering on collapsed genomic regions (e.g., coding regions in a gene) - clustplot (bool): flag to generate a needle plot with clusters for an element - qqplot (bool): flat to generate a quantile-quantile (QQ) plot for a dataset - gzip (bool): flag to generate GZIP compressed output files - - Returns: - None - - """ global logger diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ed3ce4b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,46 @@ +[project] +name = "oncodriveclustl" +dynamic = ["version"] +description = "OncodriveCLUSTL is a clustering method to identify cancer drivers" +authors = [ + { name = "BBGLab (Barcelona Biomedical Genomics Lab)", email = "bbglab@irbbarcelona.org" } +] +dependencies = [ + "bgparsers==0.10", + "bgreference==0.7", + "bgsignature==0.2", + "click==8.1.7", + "daiquiri==3.2.5.1", + "intervaltree==3.1.0", + "matplotlib==3.9.2", + "numpy==2.0.1", + "pandas==2.2.2", + "scikit-learn==1.5.1", + "scipy==1.14.0", + "statsmodels==0.14.2", + "tqdm==4.66.5", +] +readme = "README.md" +requires-python = ">=3.10,<3.13" +license = { file = "LICENSE" } + +[project.scripts] +"oncodriveclustl" = "oncodriveclustl.main:main" +"parse_vcf" = "oncodriveclustl.parsers.vcf:vcf_to_tsv" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [] + +[tool.hatch.version] +path = "oncodriveclustl/__init__.py" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["oncodriveclustl"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b14023e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -bgparsers>=0.9 -bgreference>=0.5 -bgsignature>=0.2 -click>=6.7 -daiquiri>=1.3.0 -intervaltree>=2.1.0 
-matplotlib>=2.0.2 -numpy>=1.13.3 -pandas>=0.22.0 -scikit-learn>=0.19.2 -scipy>=1.0.0 -statsmodels>=0.8.0 -tqdm>=4.19.4 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 20fe750..0000000 --- a/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -import sys -from os import path -from setuptools import setup, find_packages - -from oncodriveclustl import __version__ - -DESCRIPTION = "OncodriveCLUSTL is a clustering method to identify cancer drivers" - -# Check the python compatibility -if sys.hexversion < 0x03050000: - raise RuntimeError('This package requires Python 3.5 or later.') - - -directory = path.dirname(path.abspath(__file__)) -with open(path.join(directory, 'requirements.txt')) as f: - install_requires = f.read().splitlines() - - -# Get the long description from the README file -with open(path.join(directory, 'README.rst'), encoding='utf-8') as f: - long_description = f.read() - - -setup( - name="oncodriveclustl", - python_requires='>=3.5', - version=__version__, - packages=find_packages(), - package_data={'oncodriveclustl': ['data/*.tsv']}, - author='BBGLab (Barcelona Biomedical Genomics Lab)', - author_email='bbglab@irbbarcelona.org', - description=DESCRIPTION, - license="AGPLv3", - keywords="", - url="https://bitbucket.org/bbglab/oncodriveclustl", - download_url="https://bitbucket.org/bbglab/oncodriveclustl/get/" + __version__ + ".tar.gz", - long_description=long_description, - install_requires=install_requires, - entry_points={ - 'console_scripts': [ - 'oncodriveclustl = oncodriveclustl.main:main', - 'parse_vcf = oncodriveclustl.parsers.vcf:vcf_to_tsv', - ] - } -)