diff --git a/3rd_party/clusty b/3rd_party/clusty
index d80c26a..7b109d4 160000
--- a/3rd_party/clusty
+++ b/3rd_party/clusty
@@ -1 +1 @@
-Subproject commit d80c26aec4c09a4715cb43763fa66c5baf8d9968
+Subproject commit 7b109d42a4c603e26dead5b566d43c0506a858d7
diff --git a/3rd_party/kmer-db b/3rd_party/kmer-db
index 18719c7..742b494 160000
--- a/3rd_party/kmer-db
+++ b/3rd_party/kmer-db
@@ -1 +1 @@
-Subproject commit 18719c7329b4f3c9bb0a0ac44d030c517c7a1bbb
+Subproject commit 742b4942b71271e8b0a1be63405e86b0d1f795ec
diff --git a/3rd_party/lz-ani b/3rd_party/lz-ani
index cdcaa0c..e3cc571 160000
--- a/3rd_party/lz-ani
+++ b/3rd_party/lz-ani
@@ -1 +1 @@
-Subproject commit cdcaa0ccb416d48a0689839cfdf78faaf67bf8a9
+Subproject commit e3cc571d973aedf634afd349c641dbb1328ea493
diff --git a/3rd_party/ref-utils b/3rd_party/ref-utils
index cbee86d..21d36c7 160000
--- a/3rd_party/ref-utils
+++ b/3rd_party/ref-utils
@@ -1 +1 @@
-Subproject commit cbee86d539a83ded811a9abbdb3f8c892b4fdc07
+Subproject commit 21d36c7c5a629e23446400d51cfd317c57ac5dc7
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..887a3f6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include README.md
+include LICENSE
+recursive-include bin *
+recursive-exclude 3rd_party *
diff --git a/README.md b/README.md
index ded09b4..0373e77 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,14 @@
# Vclust
-![version](https://img.shields.io/badge/version-1.2.7-blue.svg)
-[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
-[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)
+![version](https://img.shields.io/badge/version-1.2.8-blue.svg)
+![PyPI - Version](https://img.shields.io/pypi/v/vclust?label=PyPI%20version&color=blue)
[![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/vclust?label=PyPI%20downloads)
+[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
+[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)
+
![x86-64](https://img.shields.io/static/v1?label=%E2%80%8B&message=x86-64&color=yellow&logo=PCGamingWiki&logoColor=white)
![ARM](https://img.shields.io/static/v1?label=%E2%80%8B&message=ARM&color=yellow&logo=Raspberry%20Pi&logoColor=white)
![Apple M](https://img.shields.io/static/v1?label=%E2%80%8B&message=Apple%20M&color=yellow&logo=Apple&logoColor=white)
@@ -51,18 +54,17 @@ For datasets containing up to 1000 viral genomes, Vclust is available at [http:/
## Quick start
```bash
-# Clone repository and build Vclust
-git clone --recurse-submodules https://github.com/refresh-bio/vclust
-cd vclust && make -j
+# Install Vclust (requires Python >= 3.7)
+pip install vclust
# Prefilter similar genome sequence pairs before conducting pairwise alignments.
-./vclust.py prefilter -i example/multifasta.fna -o fltr.txt
+vclust prefilter -i example/multifasta.fna -o fltr.txt
# Align similar genome sequence pairs and calculate pairwise ANI measures.
-./vclust.py align -i example/multifasta.fna -o ani.tsv --filter fltr.txt
+vclust align -i example/multifasta.fna -o ani.tsv --filter fltr.txt
# Cluster genome sequences based on given ANI measure and minimum threshold.
-./vclust.py cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95
+vclust cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95
```
## Documentation
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3fa8bdb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,56 @@
+[build-system]
+requires = ["setuptools>=61.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+py-modules = ["vclust"]
+
+[tool.setuptools.packages.find]
+where = ["./"]
+
+[project]
+name = "vclust-test"
+description = """Fast and accurate tool for calculating \
+Average Nucleotide Identity (ANI) and clustering virus \
+genomes and metagenomic contigs"""
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [
+ { name = "Andrzej Zielezinski", email = "andrzej.zielezinski@amu.edu.pl" },
+ { name = "Adam Gudyś", email = "adam.gudys@polsl.pl" },
+ { name = "Sebastian Deorowicz", email = "sebastian.deorowicz@polsl.pl" },
+]
+requires-python = ">=3.7"
+dynamic = ["version"]
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Natural Language :: English",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Science/Research",
+ "Topic :: Scientific/Engineering",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+ "Operating System :: POSIX :: Linux",
+ "Operating System :: MacOS",
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+]
+
+[tool.setuptools.dynamic]
+version = { attr = "vclust.__version__" }
+
+[tool.setuptools.package-data]
+"*" = ["bin/*"]
+
+[project.scripts]
+vclust = "vclust:main"
+
+[project.urls]
+Homepage = "https://github.com/refresh-bio/vclust"
+Documentation = "https://github.com/refresh-bio/vclust/wiki"
+Website = "http://vclust.org"
\ No newline at end of file
diff --git a/vclust.py b/vclust.py
index be353f6..00c7269 100755
--- a/vclust.py
+++ b/vclust.py
@@ -9,13 +9,14 @@
import multiprocessing
import os
import pathlib
+import platform
import shutil
import subprocess
import sys
import typing
import uuid
-__version__ = '1.2.7'
+__version__ = '1.2.8'
DEFAULT_THREAD_COUNT = min(multiprocessing.cpu_count(), 64)
@@ -59,7 +60,7 @@ def ranged_float_type(value):
return f
parser = argparse.ArgumentParser(
- description=f'%(prog)s v.{__version__}: calculate ANI and cluster '
+ description=f'%(prog)s v{__version__}: calculate ANI and cluster '
'virus (meta)genome sequences',
add_help=False,
)
@@ -117,7 +118,7 @@ def ranged_float_type(value):
'--min-kmers',
metavar="",
type=int,
- default=10,
+ default=20,
help='Filter genome pairs based on minimum number of shared k-mers '
'[%(default)s]'
)
@@ -531,7 +532,7 @@ def ranged_float_type(value):
'--bin',
metavar='',
type=pathlib.Path,
- dest="BIN_CLUSTY",
+ dest="bin_clusty",
default=f'{BIN_CLUSTY}',
help='Path to the Clusty binary [%(default)s]'
)
@@ -603,8 +604,8 @@ def get_uuid() -> str:
return f'vclust-{str(uuid.uuid4().hex)[:10]}'
-def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
- """Validates the existence and executability of a binary file.
+def _validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
+ """Validates the presence and executability of a binary file.
This function checks if the provided path points to an existing binary file
and if it is executable. It also attempts to run the binary to ensure it
@@ -618,16 +619,16 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
pathlib.Path: The resolved path to the binary file.
Raises:
- SystemExit: If the binary file does not exist, is not executable, or
- if running the binary encounters an error.
+ RuntimeError: If the binary file does not exist, is not executable,
+ or if running the binary encounters an error.
"""
bin_path = bin_path.resolve()
if not bin_path.exists():
- exit(f'error: Executable not found: {bin_path}')
+ raise RuntimeError(f'File not found: {bin_path}')
if not bin_path.is_file() or not os.access(bin_path, os.X_OK):
- exit(f'error: Binary file not executable: {bin_path}')
+ raise RuntimeError(f'Binary file not executable: {bin_path}')
try:
subprocess.run(
@@ -638,14 +639,21 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
check=True
)
except subprocess.CalledProcessError as e:
- exit(f'error: Running {bin_path} failed with message: {e.stderr}')
+ raise RuntimeError(f'Running {bin_path} failed with message: {e.stderr}')
except OSError as e:
- exit(f'error: OSError in {bin_path} - {e}')
+ raise RuntimeError(f'OSError in {bin_path} - {e}')
except Exception as e:
- exit(f'error: Unexpected error in binary {bin_path} - {e}')
+ raise RuntimeError(f'Unexpected error in binary {bin_path} - {e}')
return bin_path
+def validate_binary(bin_path: pathlib.Path) -> pathlib.Path:
+ try:
+ return _validate_binary(bin_path)
+ except RuntimeError as e:
+ sys.exit(f'error: {e}')
+
+
def validate_args_fasta_input(args, parser) -> argparse.Namespace:
"""Validates the arguments for FASTA input."""
args.is_multifasta = True
@@ -732,13 +740,13 @@ def run(
)
except subprocess.CalledProcessError as e:
logger.error(f'Process {" ".join(cmd)} failed with message: {e.stderr}')
- exit(1)
+ sys.exit(1)
except OSError as e:
logger.error(f'OSError: {" ".join(cmd)} failed with message: {e}')
- exit(1)
+ sys.exit(1)
except Exception as e:
logger.error(f'Unexpected: {" ".join(cmd)} failed with message: {e}')
- exit(1)
+ sys.exit(1)
logger.info(f'Done')
return process
@@ -1145,11 +1153,75 @@ def cmd_clusty(
return cmd
-def vclust_info():
- print(f'Vclust {__version__}')
- for bin_path in [BIN_KMERDB, BIN_FASTASPLIT, BIN_LZANI, BIN_CLUSTY]:
- validate_binary(bin_path)
- print(f'{bin_path.name:<20} ok')
+def vclust_info() -> None:
+ """
+ Displays the Vclust version, installation paths, and binary dependencies.
+ Checks for the presence and executable status of required binaries.
+
+ Exits with a non-zero status if any dependencies are missing or
+ not executable.
+
+ Returns:
+ None
+
+ Raises:
+ SystemExit: If any binary dependencies are missing or not executable.
+
+ """
+ # ANSI color codes for terminal output.
+ GREEN = '\033[92m'
+ RED = '\033[91m'
+ RESET = '\033[0m'
+
+ binaries = {
+ 'Kmer-db': BIN_KMERDB,
+ 'LZ-ANI': BIN_LZANI,
+ 'Clusty': BIN_CLUSTY,
+ 'multi-fasta-split': BIN_FASTASPLIT,
+ }
+
+ output_lines = [
+ f'Vclust version {__version__} (Python {platform.python_version()})',
+ '',
+ 'Installed at:',
+ f' {pathlib.Path(__file__).resolve()}',
+ f' {BIN_DIR.resolve()}',
+ '',
+ 'Binary dependencies:',
+ ]
+
+ errors = [] # List to collect any errors encountered during binary checks.
+
+ # Check each binary's presence and version.
+ for name, path in binaries.items():
+ try:
+ _validate_binary(path)
+ version = subprocess.run(
+ [str(path), '-version' if name == 'Kmer-db' else '--version'],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ check=True
+ ).stderr.strip()
+ output_lines.append(f' {name:<20} v{version:<10}')
+ except Exception as e:
+ output_lines.append(f' {name:<20} [error]')
+ errors.append((name, e))
+
+ # Append the status summary based on any encountered errors.
+ output_lines.append('')
+
+ if errors:
+ output_lines.append(f'{RED}Status: error{RESET}')
+ output_lines.extend(f" - {name}: {error}" for name, error in errors)
+ else:
+ output_lines.append(f'{GREEN}Status: ok{RESET}')
+
+ # Output the complete information.
+ print('\n'.join(output_lines))
+
+ if errors:
+ sys.exit(1)
class CustomHelpFormatter(argparse.HelpFormatter):
@@ -1324,7 +1396,7 @@ def main():
# Cluster
elif args.command == 'cluster':
- args.BIN_CLUSTY = validate_binary(args.BIN_CLUSTY)
+ args.bin_clusty = validate_binary(args.bin_clusty)
args = validate_args_cluster(args, parser)
cmd = cmd_clusty(
@@ -1344,7 +1416,7 @@ def main():
leiden_resolution=args.leiden_resolution,
leiden_beta=args.leiden_beta,
leiden_iterations=args.leiden_iterations,
- bin_path=args.BIN_CLUSTY,
+ bin_path=args.bin_clusty,
)
p = run(cmd, args.verbose, logger)