From ba4a4e0633cfc65b8a7b4f786543fbf13b8e7172 Mon Sep 17 00:00:00 2001
From: Christian Perez Llamas <932644+chris-zen@users.noreply.github.com>
Date: Sun, 18 Aug 2024 14:52:27 +0200
Subject: [PATCH] Upgrading versions and adding Continuous Integration and
 Delivery

---
 .dockerignore                |   4 +
 .github/workflows/build.yaml | 156 ++++++++++++++++++
 .gitignore                   |   7 +
 .hadolint.yaml               |   2 +
 .python-version              |   1 +
 Dockerfile                   |  10 ++
 MANIFEST.in                  |   4 -
 Makefile                     | 121 ++++++++++++++
 README.md                    | 302 +++++++++++++++++++++++++++++++++++
 oncodriveclustl/__init__.py  |   3 +-
 oncodriveclustl/main.py      |  36 -----
 pyproject.toml               |  46 ++++++
 requirements.txt             |  13 --
 setup.py                     |  45 ------
 14 files changed, 650 insertions(+), 100 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 .github/workflows/build.yaml
 create mode 100644 .hadolint.yaml
 create mode 100644 .python-version
 create mode 100644 Dockerfile
 delete mode 100644 MANIFEST.in
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 pyproject.toml
 delete mode 100644 requirements.txt
 delete mode 100644 setup.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..aea8d0b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,4 @@
+.venv/
+dist/
+example/
+oncodriveclustl.egg-info/
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
new file mode 100644
index 0000000..102a0c2
--- /dev/null
+++ b/.github/workflows/build.yaml
@@ -0,0 +1,156 @@
+name: Build and Publish
+
+on:
+  push:
+    tags:
+      - "**"
+    branches:
+      - "**"
+
+permissions:
+  contents: read
+
+env:
+  TERM: xterm
+  PYTHON_VERSION: "3.12"  # quoted: an unquoted x.y is a YAML float, so e.g. 3.10 would turn into 3.1
+
+jobs:
+  packages-build:
+    name: Build packages
+    runs-on: ubuntu-latest
+    env:
+      RUFF_FORMAT: github
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: eifinger/setup-rye@v4
+
+      - name: Check format
+        run: |
+          make check-format || true
+          BOLDRED=$(tput bold && tput setaf 1)
+          RESET=$(tput sgr0)
+          echo "${BOLDRED}==> We won't fail on formatting errors for the time being, but we will in the future.${RESET}"
+      
+      - name: Check lint
+        run: |
+          make check-lint || true
+          BOLDRED=$(tput bold && tput setaf 1)
+          RESET=$(tput sgr0)
+          echo "${BOLDRED}==> We won't fail on lint errors for the time being, but we will in the future.${RESET}"
+
+      - name: Build packages
+        run: make build-dist
+
+      - name: Upload packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-packages
+          path: dist
+
+  docker-build:
+    name: Build Docker image
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: eifinger/setup-rye@v4
+
+      - name: Check Dockerfile
+        run: make check-docker
+          
+      - name: Build Docker image
+        run: make build-image
+
+      # TODO: Enable this when we figure out how to run it without having to download several Gigabytes of data.
+      # - name: Test Docker image
+      #   run: make run-example
+
+  check-version:
+    name: Check version
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/')
+    needs:
+      - packages-build
+      - docker-build
+  
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: eifinger/setup-rye@v4
+      
+      - name: Check version matching the tag
+        run: make check-version
+
+  packages-publish:
+    name: Publish packages
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/')
+    needs:
+      - check-version
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: eifinger/setup-rye@v4
+      
+      - name: Download packages
+        uses: actions/download-artifact@v4
+        with:
+          name: python-packages
+          path: dist  # restore the files under dist/, where `rye publish` looks; the default is the workspace root
+      - name: Publish to PyPI
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+        if: ${{ env.PYPI_TOKEN != '' }}
+        run: make publish-dist
+
+  docker-push:
+    name: Push Docker image
+    if: startsWith(github.ref, 'refs/tags/')
+    runs-on: ubuntu-latest
+    env:
+      DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}  # empty when the secret is not configured; gates the steps below
+    needs:
+      - check-version
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: eifinger/setup-rye@v4
+
+      - name: Login to DockerHub
+        if: ${{ env.DOCKER_USERNAME != '' }}
+        env: { DOCKER_PASSWORD: "${{ secrets.DOCKER_PASSWORD }}" }  # via env, never interpolated into the script text
+        run: echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
+
+      - name: Push Docker image
+        if: ${{ env.DOCKER_USERNAME != '' }}
+        run: make build-image push-image  # this runner is fresh: the image built by docker-build is not available here
diff --git a/.gitignore b/.gitignore
index 572b27a..de6a073 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ dataset_randomizator.py
 Distance_mutations.ipynb
 oncodriveclustl/count_mutations.py
 oncodriveclustl/mutations_intogen_local.txt
+example/output
 
 # Singularity images
 *.simg
@@ -35,7 +36,13 @@ oncodriveclustl/mutations_intogen_local.txt
 *~
 
 # Python bytecode
+__pycache__
 *.pyc
 *.egg-info/
 dist/
 
+# rye files
+.venv/
+requirements-dev.lock
+requirements.lock
+
diff --git a/.hadolint.yaml b/.hadolint.yaml
new file mode 100644
index 0000000..bc1caf0
--- /dev/null
+++ b/.hadolint.yaml
@@ -0,0 +1,2 @@
+ignored:
+  - DL3003
\ No newline at end of file
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..455808f
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.4
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6e638a7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.12-slim
+
+# hadolint ignore=DL3042
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,target=/project,rw \
+    cd /project && pip install .
+
+RUN oncodriveclustl --help
+
+ENTRYPOINT [ "/usr/local/bin/oncodriveclustl" ]
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 523dbf3..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,4 +0,0 @@
-# Files to be included in the distribution
-include MANIFEST.in
-include requirements.txt
-include data/*
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4b29b29
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,121 @@
+ROOT_DIR := $(shell echo $(dir $(lastword $(MAKEFILE_LIST))) | sed 's|/*$$||')
+
+SHELL := /bin/bash
+
+VERSION = $(shell rye version)
+
+GIT_TAG_OR_SHA = $(shell git describe --tags --exact-match 2>/dev/null || git rev-parse --short HEAD)
+
+IMAGE_TAG = $(VERSION)
+IMAGE := bbglab/oncodriveclustl:$(IMAGE_TAG)
+
+BOLDRED := $(shell tput bold && tput setaf 1)
+BOLDGREEN := $(shell tput bold && tput setaf 2)
+BOLDYELLOW := $(shell tput bold && tput setaf 3)
+BOLDBLUE := $(shell tput bold && tput setaf 4)
+LIGHTBLUE := $(shell tput setaf 6)
+WHITE := $(shell tput sgr0 && tput setaf 7)
+RESET := $(shell tput sgr0)
+
+
+.PHONY: help
+help:
+	@echo "$(BOLDYELLOW)Available targets:$(RESET)"
+	@echo
+	@echo "$(BOLDGREEN)  checks       $(WHITE)-> Run all the checks (format and lint)"
+	@echo "$(BOLDGREEN)  check-format $(WHITE)-> Check for formatting errors"
+	@echo "$(BOLDGREEN)  check-lint   $(WHITE)-> Check for lint errors"
+	@echo "$(BOLDGREEN)  check-docker $(WHITE)-> Check the Dockerfile"
+	@echo "$(BOLDGREEN)  format       $(WHITE)-> Format source code"
+	@echo "$(BOLDGREEN)  build-dist   $(WHITE)-> Build source and wheel distribution files"
+	@echo "$(BOLDGREEN)  build-image  $(WHITE)-> Build the Docker image"
+	@echo "$(BOLDGREEN)  push-image   $(WHITE)-> Push the Docker image into DockerHub"
+	@echo "$(BOLDGREEN)  run-example  $(WHITE)-> Run the included example using the Docker image"
+	@echo "$(BOLDGREEN)  clean        $(WHITE)-> Clean the working directory (build files, virtual environments, caches)"
+	@echo "$(RESET)"
+
+.PHONY: rye-installed
+rye-installed:
+	@if ! which rye > /dev/null; then \
+		echo "$(BOLDRED)This project build is managed by $(BOLDYELLOW)rye$(BOLDRED), which is not installed.$(RESET)"; \
+		echo "$(LIGHTBLUE)Please follow these instructions to install it:$(RESET)"; \
+		echo "$(LIGHTBLUE)--> $(BOLDBLUE)https://rye.astral.sh/guide/installation/$(RESET)"; \
+		exit 1; \
+	fi
+
+.PHONY: checks
+checks: check-format check-lint check-docker
+
+.PHONY: check-format
+check-format: rye-installed
+	@echo "$(BOLDGREEN)Checking code format ...$(RESET)"
+	rye fmt --check
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: check-lint
+check-lint: rye-installed
+	@echo "$(BOLDGREEN)Checking lint ...$(RESET)"
+	rye lint
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: check-docker
+check-docker:
+	@echo "$(BOLDGREEN)Checking Dockerfile ...$(RESET)"
+	docker run --rm -i \
+		-v $$(pwd):/project \
+		hadolint/hadolint hadolint \
+		--config /project/.hadolint.yaml \
+		/project/Dockerfile
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: check-version
+check-version: rye-installed  # fails unless `rye version` equals the exact git tag on HEAD (GIT_TAG_OR_SHA)
+	@echo "$(BOLDGREEN)Checking that the version matches the tag ...$(RESET)"
+	@if [ "$(VERSION)" != "$(GIT_TAG_OR_SHA)" ]; then \
+		echo "$(BOLDRED)==> Version $(BOLDYELLOW)$(VERSION)$(BOLDRED) doesn't match the git tag $(BOLDYELLOW)$(GIT_TAG_OR_SHA)$(BOLDRED) !!!$(RESET)"; \
+		echo "$(BOLDRED)==> Please update the $(BOLDYELLOW)__version__$(BOLDRED) in $(BOLDYELLOW)oncodriveclustl/__init__.py$(BOLDRED) and re-create the tag.$(RESET)"; \
+		exit 1; \
+	fi
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: format
+format: rye-installed
+	@echo "$(BOLDGREEN)Formatting code ...$(RESET)"
+	rye fmt
+
+.PHONY: build-dist
+build-dist: rye-installed
+	@echo "$(BOLDGREEN)Building packages ...$(RESET)"
+	rye build
+
+.PHONY: publish-dist
+publish-dist: rye-installed  # requires PYPI_TOKEN (environment or make variable); the publish command is not echoed
+	@echo "$(BOLDGREEN)Publishing OncodriveCLUSTL $(BOLDYELLOW)$(VERSION)$(BOLDGREEN) to PyPI ...$(RESET)"
+	@[[ -n "$(PYPI_TOKEN)" ]] || (echo "$(BOLDRED)==> Missing PyPI token !!!$(RESET)"; exit 1)
+	@rye publish --token "$(PYPI_TOKEN)"
+
+.PHONY: build-image
+build-image: rye-installed
+	@echo "$(BOLDGREEN)Building Docker image $(BOLDYELLOW)$(IMAGE)$(BOLDGREEN) ...$(RESET)"
+	docker build --progress=plain -t $(IMAGE) .
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: push-image
+push-image: rye-installed  # pushes $(IMAGE); the image must already exist locally (see build-image)
+	@echo "$(BOLDGREEN)Pushing the Docker image into the DockerHub ...$(RESET)"
+	docker push $(IMAGE)
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: run-example
+run-example: rye-installed
+	@echo "$(BOLDGREEN)Running example ...$(RESET)"
+	rye run oncodriveclustl \
+		-i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output \
+		-sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --clustplot -e KRAS
+	@echo "$(BOLDGREEN)==> Success!$(RESET)"
+
+.PHONY: clean
+clean:
+	@echo "$(BOLDGREEN)Cleaning the repository ...$(RESET)"
+	rm -rf ./oncodriveclustl.egg-info ./dist ./.ruff_cache ./.venv
+	find . -name "__pycache__" -type d -exec rm -r {} +
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9f78e91
--- /dev/null
+++ b/README.md
@@ -0,0 +1,302 @@
+# OncodriveCLUSTL
+
+OncodriveCLUSTL is a sequence-based clustering method to identify significant
+clustering signals in nucleotide sequence.
+
+One of the main goals of cancer research is the identification of the genomic
+elements that drive tumorigenesis. OncodriveCLUSTL is a new nucleotide
+sequence-based clustering algorithm to detect significant clustering signals
+across genomic regions. OncodriveCLUSTL is based on a local background model
+derived from the nucleotide context mutational probabilities of the cohort under
+study. Our method is able to identify well-known cancer drivers in coding
+regions and it can be applied to non-coding regions and non-human data.
+
+## License
+
+OncodriveCLUSTL is available to the general public subject to certain conditions
+described in its [LICENSE](LICENSE).
+
+
+## Installation
+
+OncodriveCLUSTL depends on Python 3.10 or above. We recommend installing it using
+the [Anaconda Python distribution](https://www.anaconda.com/download/):
+
+```bash
+conda install -c bbglab oncodriveclustl
+```
+
+OncodriveCLUSTL can also be installed using pip:
+
+```bash
+pip install oncodriveclustl
+```
+
+You can obtain the latest code from the repository and install it for development with pip:
+
+```bash
+git clone git@bitbucket.org:bbglab/oncodriveclustl.git
+cd oncodriveclustl
+python -m venv .venv
+.venv/bin/pip install -e .
+source .venv/bin/activate
+oncodriveclustl --help
+```
+
+> [!NOTE]
+> The first time that you run OncodriveCLUSTL with a given reference genome, it
+> will download it from our servers. By default the downloaded datasets go to
+> `~/.bgdata`. If you want to move these datasets to another folder you have to
+> define the system environment variable `BGDATA_LOCAL` with an export command.
+
+> [!NOTE]
+> If you install a modern build tool like [rye](https://rye.astral.sh), you can
+> simply do this:
+> ```bash
+> git clone git@bitbucket.org:bbglab/oncodriveclustl.git
+> cd oncodriveclustl
+> rye sync
+> rye run oncodriveclustl --help
+> ```
+
+## Input data
+
+OncodriveCLUSTL only requires two main inputs, the mutations file and the
+annotations file.
+
+### Mutations file 
+
+TSV file containing SNVs (substitutions) mapped to a reference genome (e.g.,
+human hg19 or mouse c3h). If other mutation types are present (insertions,
+deletions, double base substitutions, etc), they will be filtered out during the
+analysis. This file must contain, at least, the following 5 columns with header:
+
+1. **CHROMOSOME**: 1, 2,..., X, Y
+2. **POSITION**: Integer indicating the position of the mutation
+3. **REF**: Reference nucleotide
+4. **ALT**: Alternate nucleotide
+5. **SAMPLE**: Identifier of the sample
+
+Additional columns are:
+
+6. **CANCER_TYPE**: Type of tumor. When specified, OncodriveCLUSTL will
+   calculate one mutational profile for each cancer type and mutations will be
+   randomized accordingly.
+7. **SIGNATURE**: User-defined group to compute k-mer nucleotide mutational
+   probabilities. When specified, OncodriveCLUSTL will calculate one mutational
+   profile for each group and will randomize each mutation accordingly.
+
+> [!NOTE]
+> OncodriveCLUSTL assumes all SNVs are mapped to the positive strand.
+
+> [!WARNING]
+> When using the `--signature-group` option, please check that the number of
+> mutations per group is sufficient for an accurate signatures calculation.
+
+### Annotations file
+
+TSV file containing the coordinates of genomic elements (GEs). This file must
+contain, at least, the following 5 columns with header:
+
+1. **CHROMOSOME**: 1, 2,..., X, Y
+2. **START**: Starting position of the genomic region
+3. **END**: Final position of the genomic region
+4. **ELEMENT**: Identifier of the GE
+5. **SYMBOL**: Symbol of the GE. OncodriveCLUSTL will analyze GEs as **SYMBOL** + **ELEMENT**.
+
+Additional columns are:
+
+6. **STRAND**: Strand of the GE coordinates ("+" or "-").
+
+> [!WARNING]
+> Coordinates of a given GE cannot overlap.
+
+You can check the input formats in the files provided in the example.
+
+If you have a VCF file or directory of VCF files containing somatic mutations,
+you can run our VCF parser to obtain a tabular file compatible with
+OncodriveCLUSTL input format:
+
+```bash
+parse_vcf -i [INPUT_DIRECTORY] -o [OUTPUT_FILE]
+```
+
+Please, check [parsers/vcf.py](oncodriveclustl/parsers/vcf.py) module for more
+details.
+
+If you would like to run OncodriveCLUSTL using a pre-calculated signature or
+mutational profile, you need to provide a dictionary containing the reference
+k-mer to alternate mutational probabilities in JSON format:
+
+```json
+{
+    "my_dataset": {
+        "GCA>G": 0.02424271083094251,
+        "AGC>A": 0.023005887103025254,
+        "ACG>T": 0.037613802858829135,
+        "CGA>C": 0.10691031051670515,
+        "GAC>G": 0.017846071811001615,
+        "TTC>A": 0.024003748061871697,
+        "CTT>G": 0.024149863672267024,
+        "GGA>T": 0.011178562948734577,
+        "AGG>C": 0.010654720767868876,
+        "GGG>C": 0.012031686292218055,
+        "CAA>T": 0.014478959792844522,
+        "TGA>A": 0.01255651801972085,
+        "GGA>A": 0.011178562948734577,
+        "CGA>A": 0.03563677017223505,
+        "TCC>T": 0.011158347971568658,
+        "GCC>A": 0.010952316565906438,
+        // ...
+    }
+}
+```
+
+OncodriveCLUSTL requires non-collapsed k-mer probabilities (192 for
+tri-nucleotides, 3072 for penta-nucleotides).
+
+## Output data
+
+OncodriveCLUSTL generates three output files:
+
+### Elements results file ('elements_results.txt')
+
+TSV file containing results of the analyzed elements:
+
+1. **SYMBOL**: GE symbol; **ENSID**: GE ID; **CGC**: True if GE in the COSMIC Cancer Gene Census (CGC) list (Sondka et al., 2018)
+2. **CHROMOSOME**: 1, 2,..., X, Y
+3. **STRAND**: Strand of the GE ("+" or "-")
+4. **LENGTH**: length (bp) of the GE
+5. **TOTAL_MUT**: total substitutions observed in the GE
+6. **CLUSTERED_MUT**: number of substitutions in a cluster
+7. **CLUSTERS**: number of clusters
+8. **SIM_CLUSTERS**: number of simulated clusters
+9. **SCORE**: GE score
+10. **P_EMPIRICAL**: empirical p-value of the GE
+11. **Q_EMPIRICAL**: empirical q-value of the GE
+12. **P_ANALYTICAL**: analytical p-value of the GE
+13. **Q_ANALYTICAL**: analytical q-value of the GE
+14. **P_TOPCLUSTER**: analytical p-value of the cluster with highest cluster score
+15. **Q_TOPCLUSTER**: analytical q-value of the cluster with highest cluster score
+
+### Clusters results file ('clusters_results.tsv')
+
+TSV file containing results of the clusters observed in the analyzed elements:
+
+1. **RANK**: Position of the GE in the list of elements
+2. **SYMBOL**: GE symbol
+3. **ENSID**: GE ID
+4. **CGC**: True if GE in the CGC list
+5. **CHROMOSOME**: 1, 2,..., X, Y
+6. **STRAND**: Strand of the GE ("+" or "-")
+7. **COORDINATES**: genomic coordinates of the cluster. It can be 'coord1,coord2'
+   for clusters inside a single region or 'coord1,coord2;coord3,coord4' for
+   those spanning regions (--concatenate flag)
+8. **MAX_COORD**: genomic position with the highest smoothing score inside the cluster
+9. **WIDTH**: cluster's width (bp)
+10. **N_MUT**: number of substitutions in the cluster
+11. **N_SAMPLES**: number of samples with a mutation in the cluster
+12. **FRA_UNIQ_SAMPLES**: proportion of unique samples mutated in the cluster out of the total of mutations in the cluster
+13. **SCORE**: cluster score
+14. **P**: analytical p-value of the cluster
+
+### Log file ('results.log')
+
+TXT file containing OncodriveCLUSTL's run information.
+
+## Usage
+
+OncodriveCLUSTL is meant to be used through the command line.
+
+```
+Usage: oncodriveclustl [OPTIONS]
+
+Options:
+  -i, --input-file PATH           File containing somatic mutations
+                                  [required]
+  -r, --regions-file PATH         File with the genomic regions to analyze
+                                  [required]
+  -o, --output-directory TEXT     Output directory to be created  [required]
+  -sig, --input-signature PATH    File containing input context based
+                                  mutational probabilities (signature)
+  -ef, --elements-file PATH       File with the symbols of the elements to
+                                  analyze
+  -e, --elements TEXT             Symbol of the element(s) to analyze
+  -g, --genome [hg38|hg19|mm10|c3h|car|cast|f344]
+                                  Genome to use
+  -emut, --element-mutations INTEGER
+                                  Cutoff of element mutations. Default is 2
+  -cmut, --cluster-mutations INTEGER
+                                  Cutoff of cluster mutations. Default is 2
+  -sw, --smooth-window INTEGER RANGE
+                                  Smoothing window. Default is 11  [3<=x<=101]
+  -cw, --cluster-window INTEGER RANGE
+                                  Cluster window. Default is 11  [3<=x<=101]
+  -kmer, --kmer [3|5]             K-mer nucleotide context
+  -n, --n-simulations INTEGER     number of simulations. Default is 1000
+  -sim, --simulation-mode [mutation_centered|region_restricted]
+                                  Simulation mode
+  -simw, --simulation-window INTEGER RANGE
+                                  Simulation window. Default is 31
+                                  [19<=x<=101]
+  -sigcalc, --signature-calculation [frequencies|region_normalized]
+                                  Signature calculation: mutation frequencies
+                                  (default) or k-mer mutation counts
+                                  normalized by k-mer region counts
+  -siggroup, --signature-group [SIGNATURE|SAMPLE|CANCER_TYPE]
+                                  Header of the column to group signatures
+                                  calculation
+  -c, --cores INTEGER RANGE       Number of cores to use in the computation.
+                                  By default it will use all the available
+                                  cores.  [1<=x<=10]
+  --seed INTEGER                  Seed to use in the simulations
+  --log-level [debug|info|warning|error|critical]
+                                  Verbosity of the logger
+  --concatenate                   Calculate clustering on concatenated genomic
+                                  regions (e.g., exons in coding sequences)
+  --clustplot                     Generate a needle plot with clusters for an
+                                  element
+  --qqplot                        Generate a quantile-quantile (Q-Q) plot for
+                                  a dataset
+  --gzip                          Gzip compress files
+  -h, --help                      Show this message and exit.
+```
+
+> [!NOTE]
+> When using simulation mode 'mutation_centered', simulation windows can be
+> simulated outside the GE.
+
+> [!NOTE]
+> When using `--signature-calculation region_normalized`, k-mer mutation counts 
+> will be normalized by k-mer nucleotide counts in the genomic regions 
+> provided as input (`--regions-file`).
+
+## Run the example
+
+If you run OncodriveCLUSTL from the [source code], you can run an example of
+TCGA pancreatic adenocarcinomas (Ellrott et al. 2018) for coding regions
+(Mularoni et al., 2016) using 1000 simulations. First you need to download the
+example folder. Then you run OncodriveCLUSTL with default mode and parameters
+as:
+
+[source code]: https://github.com/bbglab/oncodriveclustl
+
+```bash
+oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output
+```
+
+The results will be saved in a folder named `example/output`.
+
+You can compute a more sophisticated analysis using non-default parameters and
+generate a quantile-quantile plot by typing:
+
+```bash
+oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output -sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --qqplot
+```
+
+If you want to run a specific GE and generate a plot of its observed clusters, you
+can type:
+
+```bash
+oncodriveclustl -i example/PAAD.tsv.gz -r example/cds.hg19.regions.gz -o example/output -sw 15 -cw 15 -simw 35 -sim region_restricted --concatenate --clustplot -e KRAS
+```
diff --git a/oncodriveclustl/__init__.py b/oncodriveclustl/__init__.py
index 4a8a8d1..4a2bfa8 100644
--- a/oncodriveclustl/__init__.py
+++ b/oncodriveclustl/__init__.py
@@ -1,2 +1 @@
-VERSION = (1, 1, 4)
-__version__ = '.'.join([str(i) for i in VERSION])
\ No newline at end of file
+__version__ = "1.2.0"
\ No newline at end of file
diff --git a/oncodriveclustl/main.py b/oncodriveclustl/main.py
index c3cd8a7..0d236c7 100644
--- a/oncodriveclustl/main.py
+++ b/oncodriveclustl/main.py
@@ -104,42 +104,6 @@ def main(input_file,
          qqplot,
          gzip
          ):
-    """
-    OncodriveCLUSTL is a sequence based clustering method to identify cancer drivers across the genome
-
-    Args:
-        input_file (str): path to mutations file
-        regions_file (str): path to input genomic coordinates file
-        output_directory(str): path to output directory. Output files will be generated in it.
-        input_signature (str): path to file containing input context based mutational probabilities.
-            By default (when no input signatures), OncodriveCLUSTL will calculate them from the mutations input file.
-        elements_file (str): path to file containing one element per row (optional) to analyzed the listed elements.
-            By default, OncodriveCLUSTL analyzes all genomic elements contained in `regions_file`.
-        elements (str): genomic element symbol (optional). The analysis will be performed only on the specified GEs.
-        genome (str): genome to use: 'hg38', 'hg19', 'mm10', 'c3h', 'car', 'cast' and 'f344'
-        element_mutations (int): minimum number of mutations per genomic element to undertake analysis
-        cluster_mutations (int): minimum number of mutations to define a cluster
-        smooth_window (int): Tukey kernel smoothing window length
-        cluster_window (int): clustering window length
-        kmer (int): context nucleotides to calculate the mutational probabilities (trinucleotides or pentanucleotides)
-        n_simulations (int): number of simulations
-        simulation_mode (str): simulation mode
-        simulation_window (int): window length to simulate mutations
-        signature_calculation (str): signature calculation, mutation frequencies (default) or mutation counts
-            normalized by k-mer region counts
-        signature_group (str): header of the column to group signatures. One signature will be computed for each group
-        cores (int): number of CPUs to use
-        seed (int): seed
-        log_level (str): verbosity of the logger
-        concatenate (bool): flag to calculate clustering on collapsed genomic regions (e.g., coding regions in a gene)
-        clustplot (bool): flag to generate a needle plot with clusters for an element
-        qqplot (bool): flat to generate a quantile-quantile (QQ) plot for a dataset
-        gzip (bool): flag to generate GZIP compressed output files
-
-    Returns:
-        None
-
-    """
 
     global logger
 
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..ed3ce4b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,46 @@
+[project]
+name = "oncodriveclustl"
+dynamic = ["version"]
+description = "OncodriveCLUSTL is a clustering method to identify cancer drivers"
+authors = [
+    { name = "BBGLab (Barcelona Biomedical Genomics Lab)", email = "bbglab@irbbarcelona.org" }
+]
+dependencies = [
+    "bgparsers==0.10",
+    "bgreference==0.7",
+    "bgsignature==0.2",
+    "click==8.1.7",
+    "daiquiri==3.2.5.1",
+    "intervaltree==3.1.0",
+    "matplotlib==3.9.2",
+    "numpy==2.0.1",
+    "pandas==2.2.2",
+    "scikit-learn==1.5.1",
+    "scipy==1.14.0",
+    "statsmodels==0.14.2",
+    "tqdm==4.66.5",
+]
+readme = "README.md"
+requires-python = ">=3.10,<3.13"  # floor set by the pins above: scipy 1.14.0 needs >=3.10 (numpy 2.0.1 / pandas 2.2.2 need >=3.9)
+license = { file = "LICENSE" }
+
+[project.scripts]
+"oncodriveclustl" = "oncodriveclustl.main:main"
+"parse_vcf" = "oncodriveclustl.parsers.vcf:vcf_to_tsv"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.rye]
+managed = true
+dev-dependencies = []
+
+[tool.hatch.version]
+path = "oncodriveclustl/__init__.py"
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["oncodriveclustl"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index b14023e..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-bgparsers>=0.9
-bgreference>=0.5
-bgsignature>=0.2
-click>=6.7
-daiquiri>=1.3.0
-intervaltree>=2.1.0
-matplotlib>=2.0.2
-numpy>=1.13.3
-pandas>=0.22.0
-scikit-learn>=0.19.2
-scipy>=1.0.0
-statsmodels>=0.8.0
-tqdm>=4.19.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 20fe750..0000000
--- a/setup.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import sys
-from os import path
-from setuptools import setup, find_packages
-
-from oncodriveclustl import __version__
-
-DESCRIPTION = "OncodriveCLUSTL is a clustering method to identify cancer drivers"
-
-# Check the python compatibility
-if sys.hexversion < 0x03050000:
-    raise RuntimeError('This package requires Python 3.5 or later.')
-
-
-directory = path.dirname(path.abspath(__file__))
-with open(path.join(directory, 'requirements.txt')) as f:
-    install_requires = f.read().splitlines()
-
-
-# Get the long description from the README file
-with open(path.join(directory, 'README.rst'), encoding='utf-8') as f:
-    long_description = f.read()
-
-
-setup(
-    name="oncodriveclustl",
-    python_requires='>=3.5',
-    version=__version__,
-    packages=find_packages(),
-    package_data={'oncodriveclustl': ['data/*.tsv']},
-    author='BBGLab (Barcelona Biomedical Genomics Lab)',
-    author_email='bbglab@irbbarcelona.org',
-    description=DESCRIPTION,
-    license="AGPLv3",
-    keywords="",
-    url="https://bitbucket.org/bbglab/oncodriveclustl",
-    download_url="https://bitbucket.org/bbglab/oncodriveclustl/get/" + __version__ + ".tar.gz",
-    long_description=long_description,
-    install_requires=install_requires,
-    entry_points={
-        'console_scripts': [
-            'oncodriveclustl = oncodriveclustl.main:main',
-            'parse_vcf = oncodriveclustl.parsers.vcf:vcf_to_tsv',
-        ]
-    }
-)