From 2f0b39b91fca22d77f23a1c3863f93cdd01dd25e Mon Sep 17 00:00:00 2001 From: Leo Burgy Date: Mon, 9 Oct 2023 16:26:59 +0200 Subject: [PATCH 1/2] docs: Update README.md Add the change directory in the quickstart --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 54502e6..bcb6200 100755 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ If you run into any problems feel free to contact me at [dmoi@unil.ch](dmoi@unil to install from github ``` $ git clone https://github.com/DessimozLab/HogProf.git +$ cd HogProf $ pip install -r pipreqs.txt . ``` or to install from pypi From e564fe8c319d77d17d00f4586536df862b114b3e Mon Sep 17 00:00:00 2001 From: Leo Burgy Date: Mon, 9 Oct 2023 16:38:16 +0200 Subject: [PATCH 2/2] Trim project and add poetry --- .gitignore | 5 +- docs/.nojekyll | 0 docsource/conf.py | 44 - docsource/index.rst | 76 - environment.yml | 45 - pipreqs.txt | 65 - poetry.lock | 1263 +++++++++++ pyproject.toml | 72 +- src/HogProf.egg-info/PKG-INFO | 113 - .../__pycache__/__init__.cpython-310.pyc | Bin 206 -> 0 bytes .../__pycache__/profiler.cpython-310.pyc | Bin 14899 -> 0 bytes src/HogProf/build/lib/notebooks/__init__.py | 1 - src/HogProf/build/lib/pyoma/__init__.py | 1 - .../build/lib/pyoma/browser/KmerEncoder.py | 68 - .../lib/pyoma/browser/OrthoXMLSplitter.py | 195 -- .../build/lib/pyoma/browser/__init__.py | 1 - .../lib/pyoma/browser/check_db_consistency.py | 82 - .../build/lib/pyoma/browser/convert.py | 1910 ---------------- .../pyoma/browser/convert_omastandalone.py | 139 -- src/HogProf/build/lib/pyoma/browser/db.py | 1770 --------------- .../build/lib/pyoma/browser/geneontology.py | 424 ---- .../build/lib/pyoma/browser/homoeologs.py | 238 -- .../build/lib/pyoma/browser/linkout.py | 276 --- .../build/lib/pyoma/browser/locus_parser.py | 68 - src/HogProf/build/lib/pyoma/browser/models.py | 429 ---- .../build/lib/pyoma/browser/synteny.py | 102 - .../build/lib/pyoma/browser/tablefmt.py | 162 -- src/HogProf/build/lib/tests/__init__.py | 0 src/HogProf/build/lib/tests/test_hashutils.py | 28 - src/HogProf/build/lib/tests/test_hpputils.py | 13 - src/HogProf/build/lib/tests/test_profiler.py | 8 - src/HogProf/build/lib/utils/__init__.py | 1 - src/HogProf/build/lib/utils/config_utils.py | 5 - src/HogProf/build/lib/utils/files_utils.py | 159 -- src/HogProf/build/lib/utils/goatools_utils.py | 186 -- src/HogProf/build/lib/utils/hashutils.py | 166 -- .../build/lib/utils/preprocess_config.py | 17 - src/HogProf/build/lib/utils/pyhamutils.py | 86 - src/HogProf/lshbuilder.py | 486 ++--- src/HogProf/orthoxml.py | 1930 ----------------- src/HogProf/profiler.py | 839 +++---- .../__pycache__/__init__.cpython-310.pyc | Bin 210 -> 0 bytes .../utils/__pycache__/__init__.cpython-38.pyc | Bin 192 -> 0 bytes .../__pycache__/files_utils.cpython-310.pyc | Bin 4381 -> 0 bytes .../__pycache__/files_utils.cpython-38.pyc | Bin 4400 -> 0 bytes .../goatools_utils.cpython-310.pyc | Bin 6806 -> 0 bytes .../__pycache__/goautils.cpython-310.pyc | Bin 6800 -> 0 bytes .../__pycache__/hashutils.cpython-310.pyc | Bin 5110 -> 0 bytes .../__pycache__/hashutils.cpython-38.pyc | Bin 5135 -> 0 bytes .../__pycache__/pyhamutils.cpython-310.pyc | Bin 2496 -> 0 bytes .../__pycache__/pyhamutils.cpython-38.pyc | Bin 2372 -> 0 bytes src/HogProf/utils/files_utils.py | 58 +- src/HogProf/utils/goautils.py | 108 +- 53 files changed, 2026 insertions(+), 9613 deletions(-) delete mode 100644 docs/.nojekyll delete mode 100644 docsource/conf.py delete mode 100644 docsource/index.rst delete mode 100755 
environment.yml delete mode 100644 pipreqs.txt create mode 100644 poetry.lock delete mode 100644 src/HogProf.egg-info/PKG-INFO delete mode 100644 src/HogProf/__pycache__/__init__.cpython-310.pyc delete mode 100644 src/HogProf/__pycache__/profiler.cpython-310.pyc delete mode 100755 src/HogProf/build/lib/notebooks/__init__.py delete mode 100755 src/HogProf/build/lib/pyoma/__init__.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/KmerEncoder.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/__init__.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/check_db_consistency.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/convert.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/db.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/geneontology.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/homoeologs.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/linkout.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/locus_parser.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/models.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/synteny.py delete mode 100755 src/HogProf/build/lib/pyoma/browser/tablefmt.py delete mode 100755 src/HogProf/build/lib/tests/__init__.py delete mode 100755 src/HogProf/build/lib/tests/test_hashutils.py delete mode 100755 src/HogProf/build/lib/tests/test_hpputils.py delete mode 100755 src/HogProf/build/lib/tests/test_profiler.py delete mode 100755 src/HogProf/build/lib/utils/__init__.py delete mode 100755 src/HogProf/build/lib/utils/config_utils.py delete mode 100755 src/HogProf/build/lib/utils/files_utils.py delete mode 100755 src/HogProf/build/lib/utils/goatools_utils.py delete mode 100755 src/HogProf/build/lib/utils/hashutils.py delete mode 100755 src/HogProf/build/lib/utils/preprocess_config.py delete mode 100755 src/HogProf/build/lib/utils/pyhamutils.py delete mode 100755 src/HogProf/orthoxml.py delete mode 100644 src/HogProf/utils/__pycache__/__init__.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/__init__.cpython-38.pyc delete mode 100644 src/HogProf/utils/__pycache__/files_utils.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc delete mode 100644 src/HogProf/utils/__pycache__/goatools_utils.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/goautils.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc delete mode 100644 src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc delete mode 100644 src/HogProf/utils/__pycache__/pyhamutils.cpython-38.pyc diff --git a/.gitignore b/.gitignore index 6fe9c3e..cb6f6c4 100755 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ *.gaf **/*.tar.gz **/*.whl -**/*.pyc +*.pyc + +.idea/ +__pycache__/ diff --git a/docs/.nojekyll b/docs/.nojekyll deleted file mode 100644 index e69de29..0000000 diff --git a/docsource/conf.py b/docsource/conf.py deleted file mode 100644 index d092eae..0000000 --- a/docsource/conf.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -sys.path.insert(0, os.path.abspath('../../')) -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon'] -html_theme = 'sphinx_rtd_theme' - - - -# Project information -project = 'Hogprof' -author = 'Dave Moi' - -# Extensions to use -extensions = [ - 
'sphinx.ext.autodoc', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'sphinx_rtd_theme' -] - -# Theme settings -html_theme = 'sphinx_rtd_theme' -html_theme_options = { - 'collapse_navigation': False, - 'sticky_navigation': True, - 'navigation_depth': 3, - 'style_external_links': True -} - -# Add any additional options for autodoc -autodoc_default_options = { - 'member-order': 'bysource' -} - -# Add any modules to be excluded from the documentation -exclude_patterns = [] - -# The master toctree document. -master_doc = 'index' - -# Mock import modules that may not be available in the documentation build environment -autodoc_mock_imports = ['requests' , 'Bio' , 'numpy' , 'pandas' , 'matplotlib' , 'seaborn' , 'scipy' , 'wget' , 'statsmodels' , 'toytree' , 'pandas' , '' ] - diff --git a/docsource/index.rst b/docsource/index.rst deleted file mode 100644 index 3c12ad8..0000000 --- a/docsource/index.rst +++ /dev/null @@ -1,76 +0,0 @@ -HogProf -===================== - - - HogProf is an extensible and tunable approach to phylogenetic profiling using orthology data. It is powered by minhash based datastructures and computationally efficient. - - Still under major development and may change. - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - installation - usage - troubleshooting - credits - -Installation ------------- - -To install My Project, run the following command: - -Using pip - - -.. code-block:: bash - $ pip install hogprof - -Or from github -.. code-block:: bash -$ git clone https://github.com/DessimozLab/HogProf.git -$ pip install -r pipreqs.txt . - -Quickstart ------ - -To use the library we need an OMA instance's HDF5 file containing HOG info and some accesory files. - -.. code-block:: bash - $ mkdir YourOmaDirectory - $ cd YourOmaDirectory - $ wget https://omabrowser.org/All/OmaServer.h5 - $ wget https://omabrowser.org/All/oma-go.txt.gz - -Let's create a directory for the phylogenetic rpfiling database were going to make. - - -.. code-block:: bash - $ mkdir YourDBDirectory - -Ok. We're ready! Now let's compile a database containing all HOGs and our desired taxonomic levels using default settings. Launch the lshbuilder. -dbtypes available on the command line are : all , plants , archaea, bacteria , eukarya , protists , fungi , metazoa and vertebrates. These will use the NCBI taxonomy as a tree to annotate events in different gene family's histories. - -.. code-block:: bash - $python lshbuilder.py --outpath YourHogProfDirectory --dbtype all --OMA YourOmaDirectory/OmaServer.h5 --nthreads numberOfCPUcores - -This should build a taxonomic tree for the genomes contained in the release and then calculate enhanced phylogenies for all HOGs in OMA. -Once the database is completed it can be interogated using a profiler object. Construction and usage of this object should be done using a python script or notebook. - - -.. code-block:: python - import HogProf - myproject.do_x() - - - - -Troubleshooting ---------------- - - -If you encounter any issues while using My Project, please file a bug report on our GitHub repository: https://github.com/user/repo/issues - - -Credits -------- - -My Project was created by John Doe. 
\ No newline at end of file diff --git a/environment.yml b/environment.yml deleted file mode 100755 index 1b644f4..0000000 --- a/environment.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Hogprof -channels: - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - ca-certificates=2021.5.25=h06a4308_1 - - certifi=2020.12.5=py38h06a4308_0 - - ld_impl_linux-64=2.33.1=h53a641e_7 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.1.0=hdf63c60_0 - - libstdcxx-ng=9.1.0=hdf63c60_0 - - ncurses=6.2=he6710b0_1 - - openssl=1.1.1k=h27cfd23_0 - - pip=21.1.1=py38h06a4308_0 - - python=3.8.10=hdb3f193_7 - - readline=8.1=h27cfd23_0 - - setuptools=52.0.0=py38h06a4308_0 - - sqlite=3.35.4=hdfb4753_0 - - tk=8.6.10=hbc83047_0 - - wheel=0.36.2=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7b6447c_3 - - pip: - - biopython==1.78 - - chardet==4.0.0 - - datasketch==1.5.3 - - ete3==3.1.2 - - future==0.18.2 - - goatools==1.1.6 - - h5py==3.2.1 - - idna==2.10 - - lxml==4.6.3 - - numexpr==2.7.3 - - numpy==1.20.3 - - pandas==1.2.4 - - pyham==1.1.10 - - pyoma==0.11.1 - - pyopa==0.8.0 - - python-dateutil==2.8.1 - - pytz==2021.1 - - requests==2.25.1 - - scipy==1.6.3 - - six==1.16.0 - - tables==3.6.1 - - urllib3==1.26.5 \ No newline at end of file diff --git a/pipreqs.txt b/pipreqs.txt deleted file mode 100644 index 4879288..0000000 --- a/pipreqs.txt +++ /dev/null @@ -1,65 +0,0 @@ -appdirs==1.4.4 -arrow==1.1.1 -attrs==21.2.0 -beautifulsoup4==4.9.3 -biopython==1.79 -bioservices==1.7.12 -certifi==2020.12.5 -chardet==4.0.0 -colorama==0.4.4 -colorlog==6.4.1 -csb==1.2.5 -custom-inherit==2.4.0 -cycler==0.10.0 -datasketch==1.5.3 -easydev==0.11.2 -ete3==3.1.2 -future==0.18.2 -gevent==21.8.0 -goatools==1.1.6 -greenlet==1.1.1 -grequests==0.6.0 -h5py==3.2.1 -idna==2.10 -itsdangerous==2.0.1 -joblib==1.0.1 -kiwisolver==1.3.2 -lxml==4.6.3 -matplotlib==3.4.3 -multipledispatch==0.6.0 -networkx==2.6.2 -numexpr==2.7.3 -numpy==1.21.2 -pandas==1.3.2 -pexpect==4.8.0 -Pillow==8.3.2 -ptyprocess==0.7.0 -pyham==1.1.10 -pyoma==0.11.1 -pyopa==0.8.0 -pyparsing==2.4.7 -pypng==0.0.21 --e git+git@github.com:DessimozLab/HogProf.git@a76d8cae9c3f3f5799da031c1aab632ac81cb409#egg=PyProfiler&subdirectory=pyprofiler -python-dateutil==2.8.2 -pytz==2021.1 -PyYAML==5.4.1 -reportlab==3.6.1 -requests==2.25.1 -requests-cache==0.7.4 -scikit-learn==0.24.2 -scipy==1.7.1 -seaborn==0.11.2 -six==1.16.0 -sklearn==0.0 -soupsieve==2.2.1 -suds-jurko==0.6 -tables==3.6.1 -threadpoolctl==2.2.0 -toyplot==0.19.0 -url-normalize==1.4.3 -urllib3==1.26.5 -wget==3.2 -wrapt==1.12.1 -xmltodict==0.12.0 -zope.event==4.5.0 -zope.interface==5.4.0 diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..057d516 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1263 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "biopython" +version = "1.81" +description = "Freely available tools for computational molecular biology." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "biopython-1.81-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef7c79b65b0b3f3c7dc59e20a7f8ae5758d8e852cb8b9cace590dc5617e348ba"}, + {file = "biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ebfbce0d91796c7aef422ee9dffe8827e07e5abaa94545e006f1f20e965c80b"}, + {file = "biopython-1.81-cp310-cp310-win32.whl", hash = "sha256:919a2c583cabf9c96d2ae4e1245a6b0376932fb342aca302a0fc198b71ab3275"}, + {file = "biopython-1.81-cp310-cp310-win_amd64.whl", hash = "sha256:b37c0d24191e5c96ca02415a5188551980c83a0d518bbc4ffe3c9a5d1fe0ee81"}, + {file = "biopython-1.81-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7a168709694e10b338718c18d967edd5b56c237dc88642c22275796007a70000"}, + {file = "biopython-1.81-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a51d9c1d1b4b634447535da74a644fae59bc234fbbf9001e2dc6b6fbabb98019"}, + {file = "biopython-1.81-cp311-cp311-win32.whl", hash = "sha256:2f9cfaf16d55ab80d514e7aebe5710dabe4e4ff47ede851031202e33b3249da3"}, + {file = "biopython-1.81-cp311-cp311-win_amd64.whl", hash = "sha256:e41b55edcfd448630e77bf4de66a7235324a8a149621499891da6bd1d5085b9a"}, + {file = "biopython-1.81-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3b36ba1bf6395c09a365c53530c9d71f3617763fa2c1d452b3d8948368c0f1de"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c5c07123ff5f44c9e6b5369df854a38afd3c0c50ef58498a0ae8f7eb799f3e8"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97cbdbed01b2512471f36c74b91658d1dfbdcbf39bc038f6ce5a41c3e60a8fc6"}, + {file = "biopython-1.81-cp37-cp37m-win32.whl", hash = "sha256:35506e39822c52d11cf09a3951e82375ca1bb9303960b4286acf02c9a6f6c4cc"}, + {file = "biopython-1.81-cp37-cp37m-win_amd64.whl", hash = "sha256:793c42a376cd63f62f8a088ce39b7dc6b5c55e4e9031d887c434de1595bfa4b8"}, + {file = "biopython-1.81-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:11d673698b3d0d6589292ea951fb62cb24ea27d273eca0d08dbbd956690f97f5"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:655df416936662c0c8a06a549cb25e1560e1fea5067d850f34fb714b8a3fae6c"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:762c6c43a8486b5fcd07f136a3217b87d24755618b9ea9da1f17124ff44c2ad6"}, + {file = "biopython-1.81-cp38-cp38-win32.whl", hash = "sha256:ee51bb1cd7decffd24da6b76d5e01b7e2fd818ab85cf0c180226cbb5793a3abd"}, + {file = "biopython-1.81-cp38-cp38-win_amd64.whl", hash = "sha256:ccd729249fd5f586dd4c2a3507c2ea2456825d7e615e97c07c409c850eaf4594"}, + {file = "biopython-1.81-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9ba33244f0eff830beaa7240065bdb5095d96fded6599b76bbb9ddab45cd2bbd"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bb0c690c7368f255ed45236bf0f5464b476b8c083c8f634533921af78278261"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65b93b513ce9dd7b2ce058720eadf42cd03f312db3409356efeb93123d1320aa"}, + {file = "biopython-1.81-cp39-cp39-win32.whl", hash = "sha256:811796f8d222aa3869a50e31e54ce62b69106b47cd8bb06934867c0d843297b5"}, + {file = "biopython-1.81-cp39-cp39-win_amd64.whl", hash = 
"sha256:b09efcb4733c8770f25eab5fe555a96a08f5ab9e1bc36939e08ebf2ffbf3e0f1"}, + {file = "biopython-1.81.tar.gz", hash = "sha256:2cf38112b6d8415ad39d6a611988cd11fb5f33eb09346666a87263beba9614e0"}, +] + +[package.dependencies] +numpy = "*" + +[[package]] +name = "blosc2" +version = "2.2.9" +description = "Python wrapper for the C-Blosc2 library" +optional = false +python-versions = "<4,>=3.8" +files = [ + {file = "blosc2-2.2.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:135afe34913cd43b02186fb400f30e2c9bdbfe3752470d9b6b00a20e7293fb9f"}, + {file = "blosc2-2.2.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:562828192e3c6f4629823d836bec1d129dfdad38a7e6d2e84f52dcaf9979633b"}, + {file = "blosc2-2.2.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f9413d6926d7442847b115680567fd4ad4ddcdf46e2419cd2f5e82ee8d00f6c"}, + {file = "blosc2-2.2.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74a24b4efb8b608b71d8af51d5c8f16dc63f45c2145240e7d313472fa720a68e"}, + {file = "blosc2-2.2.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73c7a7afd5390d60ad8ecd1e0e5de2492c60a24cce748b8ae2da83ceda0649ad"}, + {file = "blosc2-2.2.9-cp310-cp310-win32.whl", hash = "sha256:49f3b3951764ddf6d7ad3c1c0800adef2b7780348b1fe5126b6e0970f3ea6c2f"}, + {file = "blosc2-2.2.9-cp310-cp310-win_amd64.whl", hash = "sha256:e24335d97ae43558d222b15141d8499c3b220b3d166350441a6d2a4470997921"}, + {file = "blosc2-2.2.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f774b0c20b86c99fe1ba4fa7737add60d71930662192fdf66a547707a1e3a37"}, + {file = "blosc2-2.2.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7746244318adeb552cfb45c95b329eb12e146159ae6506b06b4854dec4c3b2c1"}, + {file = "blosc2-2.2.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e82b6280107b9ec05aa0ae7d86a3f73d14bd99767901cec95dab622d37cb0d7e"}, + {file = "blosc2-2.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c11ace31c542aa6eed11708e7b92cf5d3dbbb3c1b8a691919c3bb6130caf1746"}, + {file = "blosc2-2.2.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5a4db24030be00e8ccc9ff0645716504e4caf7525b70c7976ad8434b47f04f4f"}, + {file = "blosc2-2.2.9-cp311-cp311-win32.whl", hash = "sha256:ebfc1e9736d83bffa16e49f53278de6caa7b5469c44a4448800fc40009efbbba"}, + {file = "blosc2-2.2.9-cp311-cp311-win_amd64.whl", hash = "sha256:368b12e43249e55137a05506e747cc4656539afc73bf82a85b896a2f13a529d8"}, + {file = "blosc2-2.2.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8504a92404b2ba5112db83bebdfbe7eb3c286514acb658191434f020ea084c7a"}, + {file = "blosc2-2.2.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e38cc441798595f05e70d620f1124cd4c472003f9b58c17e79dd0477a4d151fb"}, + {file = "blosc2-2.2.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f10e14c7f3b9f14431df58f9891e490af83ae6fb3d7c2a7d05722560273a2da8"}, + {file = "blosc2-2.2.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaef04627713e22bc7883a35afd499266762f700d8644a65cfafbf2879d4350"}, + {file = "blosc2-2.2.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a46f9216d63958572514354b94eaedaa2052b60b3301ec7c41c8f30c6825c718"}, + {file = "blosc2-2.2.9-cp312-cp312-win32.whl", hash = "sha256:658443f639975d29eaa3feea269a2f971d2da5cab736bb6462561d7efe261cc3"}, + {file = "blosc2-2.2.9-cp312-cp312-win_amd64.whl", hash = "sha256:0eb8ae893b60743a31feb4ed02dd96039400fb8e7fc5ff4d9adea8d70acde204"}, + {file = 
"blosc2-2.2.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82ec6d1a4343868ce833380c82f60e9799794e04d35f630af948f0f3d28c3577"}, + {file = "blosc2-2.2.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:555468f4c77a45e35a7a878fab7679bf4705585a84b81649fc423eba293cf17b"}, + {file = "blosc2-2.2.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e788170a2e80cac38f15d723f7397a87d3c522980fc4f8d96c6fa9f5a74dd3"}, + {file = "blosc2-2.2.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31c0ee147f5f78ceeb65b601c47b0431a0f6111b8443aeb1485547394725895"}, + {file = "blosc2-2.2.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:25f27b50b2823e6a2e142eff02840979c19f629eb7833b45a98332a2d728543f"}, + {file = "blosc2-2.2.9-cp39-cp39-win32.whl", hash = "sha256:fa36fa18b8d41aee7db975a318b481304e6e3558b48641ec53933287274a4ec3"}, + {file = "blosc2-2.2.9-cp39-cp39-win_amd64.whl", hash = "sha256:c840bdfd97e25cd61d6e048f8d9ee6478133f3e70c880c2cb3054db93e142bba"}, + {file = "blosc2-2.2.9.tar.gz", hash = "sha256:63606498aaa72d58215b618d4512d5d3de29000a7b01a870edce8cb21d237c40"}, +] + +[package.dependencies] +msgpack = "*" +ndindex = ">=1.4" +numpy = ">=1.20.3" +py-cpuinfo = "*" + +[[package]] +name = "certifi" +version = "2023.7.22" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.0.tar.gz", hash = "sha256:63563193aec44bce707e0c5ca64ff69fa72ed7cf34ce6e11d5127555756fd2f6"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:effe5406c9bd748a871dbcaf3ac69167c38d72db8c9baf3ff954c344f31c4cbe"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4162918ef3098851fcd8a628bf9b6a98d10c380725df9e04caf5ca6dd48c847a"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0570d21da019941634a531444364f2482e8db0b3425fcd5ac0c36565a64142c8"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5707a746c6083a3a74b46b3a631d78d129edab06195a92a8ece755aac25a3f3d"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:278c296c6f96fa686d74eb449ea1697f3c03dc28b75f873b65b5201806346a69"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a4b71f4d1765639372a3b32d2638197f5cd5221b19531f9245fcc9ee62d38f56"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5969baeaea61c97efa706b9b107dcba02784b1601c74ac84f2a532ea079403e"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3f93dab657839dfa61025056606600a11d0b696d79386f974e459a3fbc568ec"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:db756e48f9c5c607b5e33dd36b1d5872d0422e960145b08ab0ec7fd420e9d649"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:232ac332403e37e4a03d209a3f92ed9071f7d3dbda70e2a5e9cff1c4ba9f0678"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e5c1502d4ace69a179305abb3f0bb6141cbe4714bc9b31d427329a95acfc8bdd"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:2502dd2a736c879c0f0d3e2161e74d9907231e25d35794584b1ca5284e43f596"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23e8565ab7ff33218530bc817922fae827420f143479b753104ab801145b1d5b"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-win32.whl", hash = "sha256:1872d01ac8c618a8da634e232f24793883d6e456a66593135aeafe3784b0848d"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:557b21a44ceac6c6b9773bc65aa1b4cc3e248a5ad2f5b914b91579a32e22204d"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d7eff0f27edc5afa9e405f7165f85a6d782d308f3b6b9d96016c010597958e63"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a685067d05e46641d5d1623d7c7fdf15a357546cbb2f71b0ebde91b175ffc3e"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d3d5b7db9ed8a2b11a774db2bbea7ba1884430a205dbd54a32d61d7c2a190fa"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2935ffc78db9645cb2086c2f8f4cfd23d9b73cc0dc80334bc30aac6f03f68f8c"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fe359b2e3a7729010060fbca442ca225280c16e923b37db0e955ac2a2b72a05"}, + {file = 
"charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:380c4bde80bce25c6e4f77b19386f5ec9db230df9f2f2ac1e5ad7af2caa70459"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0d1e3732768fecb052d90d62b220af62ead5748ac51ef61e7b32c266cac9293"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b2919306936ac6efb3aed1fbf81039f7087ddadb3160882a57ee2ff74fd2382"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f8888e31e3a85943743f8fc15e71536bda1c81d5aa36d014a3c0c44481d7db6e"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:82eb849f085624f6a607538ee7b83a6d8126df6d2f7d3b319cb837b289123078"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7b8b8bf1189b3ba9b8de5c8db4d541b406611a71a955bbbd7385bbc45fcb786c"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5adf257bd58c1b8632046bbe43ee38c04e1038e9d37de9c57a94d6bd6ce5da34"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c350354efb159b8767a6244c166f66e67506e06c8924ed74669b2c70bc8735b1"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-win32.whl", hash = "sha256:02af06682e3590ab952599fbadac535ede5d60d78848e555aa58d0c0abbde786"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:86d1f65ac145e2c9ed71d8ffb1905e9bba3a91ae29ba55b4c46ae6fc31d7c0d4"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3b447982ad46348c02cb90d230b75ac34e9886273df3a93eec0539308a6296d7"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:abf0d9f45ea5fb95051c8bfe43cb40cda383772f7e5023a83cc481ca2604d74e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b09719a17a2301178fac4470d54b1680b18a5048b481cb8890e1ef820cb80455"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3d9b48ee6e3967b7901c052b670c7dda6deb812c309439adaffdec55c6d7b78"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:edfe077ab09442d4ef3c52cb1f9dab89bff02f4524afc0acf2d46be17dc479f5"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3debd1150027933210c2fc321527c2299118aa929c2f5a0a80ab6953e3bd1908"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f63face3a527284f7bb8a9d4f78988e3c06823f7bea2bd6f0e0e9298ca0403"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24817cb02cbef7cd499f7c9a2735286b4782bd47a5b3516a0e84c50eab44b98e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c71f16da1ed8949774ef79f4a0260d28b83b3a50c6576f8f4f0288d109777989"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9cf3126b85822c4e53aa28c7ec9869b924d6fcfb76e77a45c44b83d91afd74f9"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b3b2316b25644b23b54a6f6401074cebcecd1244c0b8e80111c9a3f1c8e83d65"}, + {file = 
"charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:03680bb39035fbcffe828eae9c3f8afc0428c91d38e7d61aa992ef7a59fb120e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cc152c5dd831641e995764f9f0b6589519f6f5123258ccaca8c6d34572fefa8"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-win32.whl", hash = "sha256:b8f3307af845803fb0b060ab76cf6dd3a13adc15b6b451f54281d25911eb92df"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:8eaf82f0eccd1505cf39a45a6bd0a8cf1c70dcfc30dba338207a969d91b965c0"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dc45229747b67ffc441b3de2f3ae5e62877a282ea828a5bdb67883c4ee4a8810"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f4a0033ce9a76e391542c182f0d48d084855b5fcba5010f707c8e8c34663d77"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ada214c6fa40f8d800e575de6b91a40d0548139e5dc457d2ebb61470abf50186"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b1121de0e9d6e6ca08289583d7491e7fcb18a439305b34a30b20d8215922d43c"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1063da2c85b95f2d1a430f1c33b55c9c17ffaf5e612e10aeaad641c55a9e2b9d"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70f1d09c0d7748b73290b29219e854b3207aea922f839437870d8cc2168e31cc"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:250c9eb0f4600361dd80d46112213dff2286231d92d3e52af1e5a6083d10cad9"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:750b446b2ffce1739e8578576092179160f6d26bd5e23eb1789c4d64d5af7dc7"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:fc52b79d83a3fe3a360902d3f5d79073a993597d48114c29485e9431092905d8"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:588245972aca710b5b68802c8cad9edaa98589b1b42ad2b53accd6910dad3545"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e39c7eb31e3f5b1f88caff88bcff1b7f8334975b46f6ac6e9fc725d829bc35d4"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:abecce40dfebbfa6abf8e324e1860092eeca6f7375c8c4e655a8afb61af58f2c"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:24a91a981f185721542a0b7c92e9054b7ab4fea0508a795846bc5b0abf8118d4"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:67b8cc9574bb518ec76dc8e705d4c39ae78bb96237cb533edac149352c1f39fe"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac71b2977fb90c35d41c9453116e283fac47bb9096ad917b8819ca8b943abecd"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3ae38d325b512f63f8da31f826e6cb6c367336f95e418137286ba362925c877e"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:542da1178c1c6af8873e143910e2269add130a299c9106eef2594e15dae5e482"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:30a85aed0b864ac88309b7d94be09f6046c834ef60762a8833b660139cfbad13"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aae32c93e0f64469f74ccc730a7cb21c7610af3a775157e50bbd38f816536b38"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b26ddf78d57f1d143bdf32e820fd8935d36abe8a25eb9ec0b5a71c82eb3895"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f5d10bae5d78e4551b7be7a9b29643a95aded9d0f602aa2ba584f0388e7a557"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:249c6470a2b60935bafd1d1d13cd613f8cd8388d53461c67397ee6a0f5dce741"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c5a74c359b2d47d26cdbbc7845e9662d6b08a1e915eb015d044729e92e7050b7"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:b5bcf60a228acae568e9911f410f9d9e0d43197d030ae5799e20dca8df588287"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:187d18082694a29005ba2944c882344b6748d5be69e3a89bf3cc9d878e548d5a"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:81bf654678e575403736b85ba3a7867e31c2c30a69bc57fe88e3ace52fb17b89"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-win32.whl", hash = "sha256:85a32721ddde63c9df9ebb0d2045b9691d9750cb139c161c80e500d210f5e26e"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:468d2a840567b13a590e67dd276c570f8de00ed767ecc611994c301d0f8c014f"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e0fc42822278451bc13a2e8626cf2218ba570f27856b536e00cfa53099724828"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09c77f964f351a7369cc343911e0df63e762e42bac24cd7d18525961c81754f4"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:12ebea541c44fdc88ccb794a13fe861cc5e35d64ed689513a5c03d05b53b7c82"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:805dfea4ca10411a5296bcc75638017215a93ffb584c9e344731eef0dcfb026a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96c2b49eb6a72c0e4991d62406e365d87067ca14c1a729a870d22354e6f68115"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf7b34c5bc56b38c931a54f7952f1ff0ae77a2e82496583b247f7c969eb1479"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:619d1c96099be5823db34fe89e2582b336b5b074a7f47f819d6b3a57ff7bdb86"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0ac5e7015a5920cfce654c06618ec40c33e12801711da6b4258af59a8eff00a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93aa7eef6ee71c629b51ef873991d6911b906d7312c6e8e99790c0f33c576f89"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7966951325782121e67c81299a031f4c115615e68046f79b85856b86ebffc4cd"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:02673e456dc5ab13659f85196c534dc596d4ef260e4d86e856c3b2773ce09843"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:c2af80fb58f0f24b3f3adcb9148e6203fa67dd3f61c4af146ecad033024dde43"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:153e7b6e724761741e0974fc4dcd406d35ba70b92bfe3fedcb497226c93b9da7"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-win32.whl", hash = "sha256:d47ecf253780c90ee181d4d871cd655a789da937454045b17b5798da9393901a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:d97d85fa63f315a8bdaba2af9a6a686e0eceab77b3089af45133252618e70884"}, + {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "datasketch" +version = "1.6.4" +description = "Probabilistic data structures for processing and searching very large datasets" +optional = false +python-versions = "*" +files = [ + {file = "datasketch-1.6.4-py3-none-any.whl", hash = "sha256:0982712115139348c21217b8ca83b8d3b342f2556f2686eeda2972604cc68532"}, + {file = "datasketch-1.6.4.tar.gz", hash = "sha256:fe5a3545885c4c84eeb49d53a8bd82414c9c26948f7b0271cfe51cf16944c81a"}, +] + +[package.dependencies] +numpy = ">=1.11" +scipy = ">=1.0.0" + +[package.extras] +benchmark = ["SetSimilaritySearch (>=0.1.7)", "matplotlib (>=3.1.2)", "nltk (>=3.4.5)", "pandas (>=0.25.3)", "pyfarmhash (>=0.2.2)", "pyhash (>=0.9.3)", "scikit-learn (>=0.21.3)", "scipy (>=1.3.3)"] +cassandra = ["cassandra-driver (>=3.20)"] +experimental-aio = ["aiounittest", "motor"] +redis = ["redis (>=2.10.0)"] +test = ["cassandra-driver (>=3.20)", "coverage", "mock (>=2.0.0)", "mockredispy", "nose (>=1.3.7)", "nose-exclude (>=0.5.0)", "pymongo (>=3.9.0)", "pytest", "redis (>=2.10.0)"] + +[[package]] +name = "docopt" +version = "0.6.2" +description = "Pythonic argument parser, that will make you smile" +optional = false +python-versions = "*" +files = [ + {file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"}, +] + +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + +[[package]] +name = "ete3" +version = "3.1.3" +description = "A Python Environment for (phylogenetic) Tree Exploration" +optional = false +python-versions = "*" +files = [ + {file = "ete3-3.1.3.tar.gz", hash = "sha256:06a3b7fa8ed90187b076a8dbbe5b1b62acee94201d3c6e822f55f449601ef6f2"}, +] + +[[package]] +name = "future" +version = "0.18.3" +description = "Clean single-source support for Python 3 and 2" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = 
"future-0.18.3.tar.gz", hash = "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"}, +] + +[[package]] +name = "fuzzyset2" +version = "0.2.2" +description = "A simple python fuzzyset implementation." +optional = false +python-versions = ">=3.6" +files = [ + {file = "fuzzyset2-0.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:778c60881834ed5ddb789feb8d3f274774e0c429e46801af9ac426353b95c0fe"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:70e9d03315eb0e1c9c07648baa957bcfc164584179283f0c2f5bafac04798192"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54ae6f676190f5ecd3dd912e10a53537710bc47361f3208fcd78d0781d2a1595"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-win32.whl", hash = "sha256:023d22f20e60a264fe5f340fbf2d8cb4438180c08def9ed32e25abbf2582fa6f"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:3c32fdf5d605715f802c766b4ddb2cf5abcd4e59fb7d203da673cd471e8e3247"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:61cfa257b41aa900c1a63e71ae07f9e96467d233db6f0bef2128f5fe644d19e2"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0301a9d57dabec30fc9d91abbb5d8e395374e72219ebc72a0d6abae9d5421810"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a01640140b341c196df55c910f93dfaf07558a1fe082b0e0511ac2b220eb10d5"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-win32.whl", hash = "sha256:1f9a8d1f7bd51129a10f17910698f52e4d6da08be1f17cd3a1039c23fd2fc7e8"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:394d307742219e0fc6854773040dc0d592ccead2f779c1586753b70792469f85"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d6a8233f6463a2091b0e5bb5349a2bb527a8dd1e335814034217aadbb42e455b"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395820970f77d6694224f98c5837758b6f9d7d33c3526980ca7e366b051e5844"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-win32.whl", hash = "sha256:ac0e5e9a52778ef1acc708a9d0e97d3ddbe7aa6a12b50a2431f54da481de31c1"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:a660a7a19d3af2898845e3ce4b7e77662bbd8b77c549d3bd48106e13ba92356f"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edb70b7e79e22d0e6467ca6b121778a14dd67181cc6657cf07da1c647b044372"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:06493e858892e5c2306f17a0f8e2c34f9a6c2020fa659fc31278aff82870a309"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6133a52ce7023f0a31540e8bb072b465ad0ff25126f5454aae0d02f88bcf6b5f"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-win32.whl", hash = "sha256:ff5313a9096f53096c8972bda186c0ffb7b78b45e68ac9b589b6d900bfdcda63"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:6c32b048b11e45b0f49fd8bb92061a8289f03d722026d966bcb2202a7af0f590"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c033ecf4ea6327de89b07cfe5fe4017e585d1a7bc4c204cc7fc2c3dae984d68"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11e0cbbd96db35b19c0911c26a8ee5c763ecd93a4e02cbcb8a736fb3ae9e6473"}, + {file = 
"fuzzyset2-0.2.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8db31967266d780ecf20d5dd88f6aeece3b7c752b9da6b3b040ab00275b20e4"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-win32.whl", hash = "sha256:3020e804d3967c9620499b0722250b0610f5a41774dd701d2d9d8c7c2e86d687"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:82a46ddd5aab26675da6b1b41221c810ea97d1d4ffdfcabbb2557028c3855715"}, + {file = "fuzzyset2-0.2.2.tar.gz", hash = "sha256:71f08c69ece31e73631f402ee532f74115255290819747d25e55661b5029cfb5"}, +] + +[package.dependencies] +rapidfuzz = ">=2.0" + +[[package]] +name = "goatools" +version = "1.3.9" +description = "Python scripts to find enrichment of GO terms" +optional = false +python-versions = "*" +files = [ + {file = "goatools-1.3.9-py3-none-any.whl", hash = "sha256:200732531e9cd897584c04347ccf86f481ee90526133f795b42cc8e59f548dc5"}, + {file = "goatools-1.3.9.tar.gz", hash = "sha256:a9c2fe7d4735d725dc685b1d4ef5df489f37506ada1c8198f0786c92559c348c"}, +] + +[package.dependencies] +docopt = "*" +numpy = "*" +openpyxl = "*" +pandas = "*" +pydot = "*" +requests = "*" +scipy = "*" +statsmodels = "*" +xlsxwriter = "*" + +[[package]] +name = "h5py" +version = "3.9.0" +description = "Read and write HDF5 files from Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "h5py-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eb7bdd5e601dd1739698af383be03f3dad0465fe67184ebd5afca770f50df9d6"}, + {file = "h5py-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:78e44686334cbbf2dd21d9df15823bc38663f27a3061f6a032c68a3e30c47bf7"}, + {file = "h5py-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68b41efd110ce9af1cbe6fa8af9f4dcbadace6db972d30828b911949e28fadd"}, + {file = "h5py-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12aa556d540f11a2cae53ea7cfb94017353bd271fb3962e1296b342f6550d1b8"}, + {file = "h5py-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:d97409e17915798029e297a84124705c8080da901307ea58f29234e09b073ddc"}, + {file = "h5py-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:551e358db05a874a0f827b22e95b30092f2303edc4b91bb62ad2f10e0236e1a0"}, + {file = "h5py-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6822a814b9d8b8363ff102f76ea8d026f0ca25850bb579d85376029ee3e73b93"}, + {file = "h5py-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54f01202cdea754ab4227dd27014bdbd561a4bbe4b631424fd812f7c2ce9c6ac"}, + {file = "h5py-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64acceaf6aff92af091a4b83f6dee3cf8d3061f924a6bb3a33eb6c4658a8348b"}, + {file = "h5py-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:804c7fb42a34c8ab3a3001901c977a5c24d2e9c586a0f3e7c0a389130b4276fc"}, + {file = "h5py-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8d9492391ff5c3c80ec30ae2fe82a3f0efd1e750833739c25b0d090e3be1b095"}, + {file = "h5py-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9da9e7e63376c32704e37ad4cea2dceae6964cee0d8515185b3ab9cbd6b947bc"}, + {file = "h5py-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e20897c88759cbcbd38fb45b507adc91af3e0f67722aa302d71f02dd44d286"}, + {file = "h5py-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbf5225543ca35ce9f61c950b73899a82be7ba60d58340e76d0bd42bf659235a"}, + {file = "h5py-3.9.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:36408f8c62f50007d14e000f9f3acf77e103b9e932c114cbe52a3089e50ebf94"}, + {file = "h5py-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:23e74b878bbe1653ab34ca49b83cac85529cd0b36b9d625516c5830cc5ca2eac"}, + {file = "h5py-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f457089c5d524b7998e3649bc63240679b8fb0a3859ea53bbb06841f3d755f1"}, + {file = "h5py-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6284061f3214335e1eec883a6ee497dbe7a79f19e6a57fed2dd1f03acd5a8cb"}, + {file = "h5py-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7a745efd0d56076999b52e8da5fad5d30823bac98b59c68ae75588d09991a"}, + {file = "h5py-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:79bbca34696c6f9eeeb36a91776070c49a060b2879828e2c8fa6c58b8ed10dd1"}, + {file = "h5py-3.9.0.tar.gz", hash = "sha256:e604db6521c1e367c6bd7fad239c847f53cc46646f2d2651372d05ae5e95f817"}, +] + +[package.dependencies] +numpy = ">=1.17.3" + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + +[[package]] +name = "lxml" +version = "4.9.3" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, + {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, + {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, + {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, + {file = 
"lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, + {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1247694b26342a7bf47c02e513d32225ededd18045264d40758abeb3c838a51f"}, + {file = "lxml-4.9.3-cp310-cp310-win32.whl", hash = "sha256:cdb650fc86227eba20de1a29d4b2c1bfe139dc75a0669270033cb2ea3d391b85"}, + {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, + {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e28c51fa0ce5674be9f560c6761c1b441631901993f76700b1b30ca6c8378d6"}, + {file = "lxml-4.9.3-cp311-cp311-win32.whl", hash = "sha256:0bfd0767c5c1de2551a120673b72e5d4b628737cb05414f03c3277bf9bed3305"}, + {file = "lxml-4.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:25f32acefac14ef7bd53e4218fe93b804ef6f6b92ffdb4322bb6d49d94cad2bc"}, + {file = "lxml-4.9.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:d3ff32724f98fbbbfa9f49d82852b159e9784d6094983d9a8b7f2ddaebb063d4"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48d6ed886b343d11493129e019da91d4039826794a3e3027321c56d9e71505be"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9a92d3faef50658dd2c5470af249985782bf754c4e18e15afb67d3ab06233f13"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b4e4bc18382088514ebde9328da057775055940a1f2e18f6ad2d78aa0f3ec5b9"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fc9b106a1bf918db68619fdcd6d5ad4f972fdd19c01d19bdb6bf63f3589a9ec5"}, + {file = "lxml-4.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:d37017287a7adb6ab77e1c5bee9bcf9660f90ff445042b790402a654d2ad81d8"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:56dc1f1ebccc656d1b3ed288f11e27172a01503fc016bcabdcbc0978b19352b7"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:578695735c5a3f51569810dfebd05dd6f888147a34f0f98d4bb27e92b76e05c2"}, + {file = "lxml-4.9.3-cp35-cp35m-win32.whl", hash = "sha256:704f61ba8c1283c71b16135caf697557f5ecf3e74d9e453233e4771d68a1f42d"}, + {file = "lxml-4.9.3-cp35-cp35m-win_amd64.whl", hash = "sha256:c41bfca0bd3532d53d16fd34d20806d5c2b1ace22a2f2e4c0008570bf2c58833"}, + {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, + {file = 
"lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, + {file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0c0850c8b02c298d3c7006b23e98249515ac57430e16a166873fc47a5d549287"}, + {file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:aca086dc5f9ef98c512bac8efea4483eb84abbf926eaeedf7b91479feb092458"}, + {file = "lxml-4.9.3-cp36-cp36m-win32.whl", hash = "sha256:50baa9c1c47efcaef189f31e3d00d697c6d4afda5c3cde0302d063492ff9b477"}, + {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:081d32421db5df44c41b7f08a334a090a545c54ba977e47fd7cc2deece78809a"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:23eed6d7b1a3336ad92d8e39d4bfe09073c31bfe502f20ca5116b2a334f8ec02"}, + {file = "lxml-4.9.3-cp37-cp37m-win32.whl", hash = "sha256:1509dd12b773c02acd154582088820893109f6ca27ef7291b003d0e81666109f"}, + {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, + {file = 
"lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3e9bdd30efde2b9ccfa9cb5768ba04fe71b018a25ea093379c857c9dad262c40"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fcdd00edfd0a3001e0181eab3e63bd5c74ad3e67152c84f93f13769a40e073a7"}, + {file = "lxml-4.9.3-cp38-cp38-win32.whl", hash = "sha256:57aba1bbdf450b726d58b2aea5fe47c7875f5afb2c4a23784ed78f19a0462574"}, + {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, + {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b6420a005548ad52154c8ceab4a1290ff78d757f9e5cbc68f8c77089acd3c432"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bb3bb49c7a6ad9d981d734ef7c7193bc349ac338776a0360cc671eaee89bcf69"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d27be7405547d1f958b60837dc4c1007da90b8b23f54ba1f8b728c78fdb19d50"}, + {file = "lxml-4.9.3-cp39-cp39-win32.whl", hash = "sha256:8df133a2ea5e74eef5e8fc6f19b9e085f758768a16e9877a60aec455ed2609b2"}, + {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, + {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, + {file = 
"lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, + {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.35)"] + +[[package]] +name = "msgpack" +version = "1.0.7" +description = "MessagePack serializer" +optional = false +python-versions = ">=3.8" +files = [ + {file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862"}, + {file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329"}, + {file = "msgpack-1.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681"}, + {file = "msgpack-1.0.7-cp310-cp310-win32.whl", hash = 
"sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9"}, + {file = "msgpack-1.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e"}, + {file = "msgpack-1.0.7-cp311-cp311-win32.whl", hash = "sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1"}, + {file = "msgpack-1.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5"}, + {file = "msgpack-1.0.7-cp312-cp312-win32.whl", hash = "sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9"}, + {file = "msgpack-1.0.7-cp312-cp312-win_amd64.whl", hash = 
"sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c"}, + {file = "msgpack-1.0.7-cp38-cp38-win32.whl", hash = "sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2"}, + {file = "msgpack-1.0.7-cp38-cp38-win_amd64.whl", hash = "sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f"}, + {file = "msgpack-1.0.7-cp39-cp39-win32.whl", hash = "sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad"}, + {file = "msgpack-1.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3"}, + {file = "msgpack-1.0.7.tar.gz", hash = "sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87"}, +] + +[[package]] +name = 
"ndindex" +version = "1.7" +description = "A Python library for manipulating indices of ndarrays." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ndindex-1.7-py3-none-any.whl", hash = "sha256:4c0555d352ac9947b0f022562aea9f5d57fa06743ea069669138f75a88b42884"}, + {file = "ndindex-1.7.tar.gz", hash = "sha256:bf9bd0b76eeada1c8275e04091f8291869ed2b373b7af48e56faf7579fd2efd2"}, +] + +[package.extras] +arrays = ["numpy"] + +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + +[[package]] +name = "numexpr" +version = "2.8.7" +description = "Fast numerical expression evaluator for NumPy" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numexpr-2.8.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d88531ffea3ea9287e8a1665c6a2d0206d3f4660d5244423e2a134a7f0ce5fba"}, + {file = "numexpr-2.8.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db1065ba663a854115cf1f493afd7206e2efcef6643129e8061e97a51ad66ebb"}, + {file = "numexpr-2.8.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4546416004ff2e7eb9cf52c2d7ab82732b1b505593193ee9f93fa770edc5230"}, + {file = "numexpr-2.8.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb2f473fdfd09d17db3038e34818d05b6bc561a36785aa927d6c0e06bccc9911"}, + {file = "numexpr-2.8.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5496fc9e3ae214637cbca1ab556b0e602bd3afe9ff4c943a29c482430972cda8"}, + {file = "numexpr-2.8.7-cp310-cp310-win32.whl", hash = "sha256:d43f1f0253a6f2db2f76214e6f7ae9611b422cba3f7d4c86415d7a78bbbd606f"}, + {file = "numexpr-2.8.7-cp310-cp310-win_amd64.whl", hash = "sha256:cf5f112bce5c5966c47cc33700bc14ce745c8351d437ed57a9574fff581f341a"}, + {file = "numexpr-2.8.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:32934d51b5bc8a6636436326da79ed380e2f151989968789cf65b1210572cb46"}, + {file = "numexpr-2.8.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f021ac93cb3dd5d8ba2882627b615b1f58cb089dcc85764c6fbe7a549ed21b0c"}, + {file = "numexpr-2.8.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dccf572763517db6562fb7b17db46aacbbf62a9ca0a66672872f4f71aee7b186"}, + {file = "numexpr-2.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11121b14ee3179bade92e823f25f1b94e18716d33845db5081973331188c3338"}, + {file = "numexpr-2.8.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:81451962d4145a46dba189df65df101d4d1caddb6efe6ebfe05982cd9f62b2cf"}, + {file = "numexpr-2.8.7-cp311-cp311-win32.whl", hash = "sha256:da55ba845b847cc33c4bf81cee4b1bddfb0831118cabff8db62888ab8697ec34"}, + {file = "numexpr-2.8.7-cp311-cp311-win_amd64.whl", hash = 
"sha256:fd93b88d5332069916fa00829ea1b972b7e73abcb1081eee5c905a514b8b59e3"}, + {file = "numexpr-2.8.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5340d2c86d83f52e1a3e7fd97c37d358ae99af9de316bdeeab2565b9b1e622ca"}, + {file = "numexpr-2.8.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3bdf8cbc00c77a46230c765d242f92d35905c239b20c256c48dbac91e49f253"}, + {file = "numexpr-2.8.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d46c47e361fa60966a3339cb4f463ae6151ce7d78ed38075f06e8585d2c8929f"}, + {file = "numexpr-2.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a371cfc1670a18eea2d5c70abaa95a0e8824b70d28da884bad11931266e3a0ca"}, + {file = "numexpr-2.8.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:47a249cecd1382d482a5bf1fac0d11392fb2ed0f7d415ebc4cd901959deb1ec9"}, + {file = "numexpr-2.8.7-cp312-cp312-win32.whl", hash = "sha256:b8a5b2c21c26b62875bf819d375d798b96a32644e3c28bd4ce7789ed1fb489da"}, + {file = "numexpr-2.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:f29f4d08d9b0ed6fa5d32082971294b2f9131b8577c2b7c36432ed670924313f"}, + {file = "numexpr-2.8.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ecaa5be24cf8fa0f00108e9dfa1021b7510e9dd9d159b8d8bc7c7ddbb995b31"}, + {file = "numexpr-2.8.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a84284e0a407ca52980fd20962e89aff671c84cd6e73458f2e29ea2aa206356"}, + {file = "numexpr-2.8.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e838289e3b7bbe100b99e35496e6cc4cc0541c2207078941ee5a1d46e6b925ae"}, + {file = "numexpr-2.8.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0983052f308ea75dd232eb7f4729eed839db8fe8d82289940342b32cc55b15d0"}, + {file = "numexpr-2.8.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8bf005acd7f1985c71b1b247aaac8950d6ea05a0fe0bbbbf3f96cd398b136daa"}, + {file = "numexpr-2.8.7-cp39-cp39-win32.whl", hash = "sha256:56ec95f8d1db0819e64987dcf1789acd500fa4ea396eeabe4af6efdcb8902d07"}, + {file = "numexpr-2.8.7-cp39-cp39-win_amd64.whl", hash = "sha256:c7bf60fc1a9c90a9cb21c4c235723e579bff70c8d5362228cb2cf34426104ba2"}, + {file = "numexpr-2.8.7.tar.gz", hash = "sha256:596eeb3bbfebc912f4b6eaaf842b61ba722cebdb8bc42dfefa657d3a74953849"}, +] + +[package.dependencies] +numpy = ">=1.13.3" + +[[package]] +name = "numpy" +version = "1.26.0" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8db2f125746e44dce707dd44d4f4efeea8d7e2b43aace3f8d1f235cfa2733dd"}, + {file = "numpy-1.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0621f7daf973d34d18b4e4bafb210bbaf1ef5e0100b5fa750bd9cde84c7ac292"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51be5f8c349fdd1a5568e72713a21f518e7d6707bcf8503b528b88d33b57dc68"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:767254ad364991ccfc4d81b8152912e53e103ec192d1bb4ea6b1f5a7117040be"}, + {file = "numpy-1.26.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:436c8e9a4bdeeee84e3e59614d38c3dbd3235838a877af8c211cfcac8a80b8d3"}, + {file = "numpy-1.26.0-cp310-cp310-win32.whl", hash = "sha256:c2e698cb0c6dda9372ea98a0344245ee65bdc1c9dd939cceed6bb91256837896"}, + {file = "numpy-1.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:09aaee96c2cbdea95de76ecb8a586cb687d281c881f5f17bfc0fb7f5890f6b91"}, + {file = 
"numpy-1.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:637c58b468a69869258b8ae26f4a4c6ff8abffd4a8334c830ffb63e0feefe99a"}, + {file = "numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:306545e234503a24fe9ae95ebf84d25cba1fdc27db971aa2d9f1ab6bba19a9dd"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6adc33561bd1d46f81131d5352348350fc23df4d742bb246cdfca606ea1208"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e062aa24638bb5018b7841977c360d2f5917268d125c833a686b7cbabbec496c"}, + {file = "numpy-1.26.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:546b7dd7e22f3c6861463bebb000646fa730e55df5ee4a0224408b5694cc6148"}, + {file = "numpy-1.26.0-cp311-cp311-win32.whl", hash = "sha256:c0b45c8b65b79337dee5134d038346d30e109e9e2e9d43464a2970e5c0e93229"}, + {file = "numpy-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:eae430ecf5794cb7ae7fa3808740b015aa80747e5266153128ef055975a72b99"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:166b36197e9debc4e384e9c652ba60c0bacc216d0fc89e78f973a9760b503388"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f042f66d0b4ae6d48e70e28d487376204d3cbf43b84c03bac57e28dac6151581"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5e18e5b14a7560d8acf1c596688f4dfd19b4f2945b245a71e5af4ddb7422feb"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6bad22a791226d0a5c7c27a80a20e11cfe09ad5ef9084d4d3fc4a299cca505"}, + {file = "numpy-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4acc65dd65da28060e206c8f27a573455ed724e6179941edb19f97e58161bb69"}, + {file = "numpy-1.26.0-cp312-cp312-win32.whl", hash = "sha256:bb0d9a1aaf5f1cb7967320e80690a1d7ff69f1d47ebc5a9bea013e3a21faec95"}, + {file = "numpy-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:ee84ca3c58fe48b8ddafdeb1db87388dce2c3c3f701bf447b05e4cfcc3679112"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a873a8180479bc829313e8d9798d5234dfacfc2e8a7ac188418189bb8eafbd2"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:914b28d3215e0c721dc75db3ad6d62f51f630cb0c277e6b3bcb39519bed10bd8"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78a22e95182fb2e7874712433eaa610478a3caf86f28c621708d35fa4fd6e7f"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f737708b366c36b76e953c46ba5827d8c27b7a8c9d0f471810728e5a2fe57c"}, + {file = "numpy-1.26.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b44e6a09afc12952a7d2a58ca0a2429ee0d49a4f89d83a0a11052da696440e49"}, + {file = "numpy-1.26.0-cp39-cp39-win32.whl", hash = "sha256:5671338034b820c8d58c81ad1dafc0ed5a00771a82fccc71d6438df00302094b"}, + {file = "numpy-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:020cdbee66ed46b671429c7265cf00d8ac91c046901c55684954c3958525dab2"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0792824ce2f7ea0c82ed2e4fecc29bb86bee0567a080dacaf2e0a01fe7654369"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d484292eaeb3e84a51432a94f53578689ffdea3f90e10c8b203a99be5af57d8"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:186ba67fad3c60dbe8a3abff3b67a91351100f2661c8e2a80364ae6279720299"}, + 
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, +] + +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + +[[package]] +name = "pandas" +version = "2.1.1" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58d997dbee0d4b64f3cb881a24f918b5f25dd64ddf31f467bb9b67ae4c63a1e4"}, + {file = "pandas-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02304e11582c5d090e5a52aec726f31fe3f42895d6bfc1f28738f9b64b6f0614"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffa8f0966de2c22de408d0e322db2faed6f6e74265aa0856f3824813cf124363"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1f84c144dee086fe4f04a472b5cd51e680f061adf75c1ae4fc3a9275560f8f4"}, + {file = "pandas-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:75ce97667d06d69396d72be074f0556698c7f662029322027c226fd7a26965cb"}, + {file = "pandas-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:4c3f32fd7c4dccd035f71734df39231ac1a6ff95e8bdab8d891167197b7018d2"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e2959720b70e106bb1d8b6eadd8ecd7c8e99ccdbe03ee03260877184bb2877d"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25e8474a8eb258e391e30c288eecec565bfed3e026f312b0cbd709a63906b6f8"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8bd1685556f3374520466998929bade3076aeae77c3e67ada5ed2b90b4de7f0"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc3657869c7902810f32bd072f0740487f9e030c1a3ab03e0af093db35a9d14e"}, + {file = "pandas-2.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:05674536bd477af36aa2effd4ec8f71b92234ce0cc174de34fd21e2ee99adbc2"}, + {file = "pandas-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b407381258a667df49d58a1b637be33e514b07f9285feb27769cedb3ab3d0b3a"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c747793c4e9dcece7bb20156179529898abf505fe32cb40c4052107a3c620b49"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3bcad1e6fb34b727b016775bea407311f7721db87e5b409e6542f4546a4951ea"}, + {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5ec7740f9ccb90aec64edd71434711f58ee0ea7f5ed4ac48be11cfa9abf7317"}, + {file = 
"pandas-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29deb61de5a8a93bdd033df328441a79fcf8dd3c12d5ed0b41a395eef9cd76f0"}, + {file = "pandas-2.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4f99bebf19b7e03cf80a4e770a3e65eee9dd4e2679039f542d7c1ace7b7b1daa"}, + {file = "pandas-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:84e7e910096416adec68075dc87b986ff202920fb8704e6d9c8c9897fe7332d6"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366da7b0e540d1b908886d4feb3d951f2f1e572e655c1160f5fde28ad4abb750"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e50e72b667415a816ac27dfcfe686dc5a0b02202e06196b943d54c4f9c7693e"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1ab6a25da197f03ebe6d8fa17273126120874386b4ac11c1d687df288542dd"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0dbfea0dd3901ad4ce2306575c54348d98499c95be01b8d885a2737fe4d7a98"}, + {file = "pandas-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0489b0e6aa3d907e909aef92975edae89b1ee1654db5eafb9be633b0124abe97"}, + {file = "pandas-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:4cdb0fab0400c2cb46dafcf1a0fe084c8bb2480a1fa8d81e19d15e12e6d4ded2"}, + {file = "pandas-2.1.1.tar.gz", hash = "sha256:fecb198dc389429be557cde50a2d46da8434a17fe37d7d41ff102e3987fd947b"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] +computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy 
(>=1.4.36)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.8.0)"] + +[[package]] +name = "patsy" +version = "0.5.3" +description = "A Python package for describing statistical models and for building design matrices." +optional = false +python-versions = "*" +files = [ + {file = "patsy-0.5.3-py2.py3-none-any.whl", hash = "sha256:7eb5349754ed6aa982af81f636479b1b8db9d5b1a6e957a6016ec0534b5c86b7"}, + {file = "patsy-0.5.3.tar.gz", hash = "sha256:bdc18001875e319bc91c812c1eb6a10be4bb13cb81eb763f466179dca3b67277"}, +] + +[package.dependencies] +numpy = ">=1.4" +six = "*" + +[package.extras] +test = ["pytest", "pytest-cov", "scipy"] + +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +description = "Get CPU info with pure Python" +optional = false +python-versions = "*" +files = [ + {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, + {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, +] + +[[package]] +name = "pydot" +version = "1.4.2" +description = "Python interface to Graphviz's Dot" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pydot-1.4.2-py2.py3-none-any.whl", hash = "sha256:66c98190c65b8d2e2382a441b4c0edfdb4f4c025ef9cb9874de478fb0793a451"}, + {file = "pydot-1.4.2.tar.gz", hash = "sha256:248081a39bcb56784deb018977e428605c1c758f10897a339fce1dd728ff007d"}, +] + +[package.dependencies] +pyparsing = ">=2.1.4" + +[[package]] +name = "pyham" +version = "1.1.12" +description = "A tool to analyse Hierarchical Orthologous Groups (HOGs)" +optional = false +python-versions = "*" +files = [ + {file = "pyham-1.1.12-py2.py3-none-any.whl", hash = "sha256:a26b47bb9a9f5b961cb31ba1f271eef7b46dafa8829913e0aace207fd2aef37f"}, + {file = "pyham-1.1.12.tar.gz", hash = "sha256:c2ec409ddae705670b2e0ed6870f0c8eb8ba8eb69f036ecdeb3c559ab9f7eb68"}, +] + +[package.dependencies] +ete3 = ">=3.1" +future = "*" +lxml = "*" +requests = "*" +scipy = "*" +six = "*" + +[package.extras] +dev = ["fabric", "fabric3", "nose", "sphinx", "twine", "wheel"] +test = ["nose"] + +[[package]] +name = "pyoma" +version = "0.12.1" +description = "library to interact and build OMA hdf5 files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyoma-0.12.1-py3-none-any.whl", hash = "sha256:3f04310ed62a049b758763005e8bc23591c34f3626809291a518d22fc27b1252"}, + {file = "pyoma-0.12.1.tar.gz", hash = "sha256:27667492eb0b465c5e2c561bf94121dee74ae4c986533cb3d475a6fc78aebfba"}, +] + +[package.dependencies] +biopython = {version = ">=1.76", markers = "python_version >= \"3.6\""} +datasketch = "*" +ete3 = "*" +future = "*" +fuzzyset2 = ">=0.1.1" +networkx = "*" +numpy = ">=1.16" +pandas = ">=0.22" +pyopa = ">=0.8" +tables = ">=3.5.1" +tqdm = "*" + +[package.extras] +create-db = ["PySAIS", "familyanalyzer (>=0.7.3)", "lark-parser", "matplotlib", "pebble", "pyham", "scikit-fuzzy", "scikit-learn"] +docs = ["sphinx"] +notebooks = ["jupyter", "matplotlib", "seaborn"] + +[[package]] +name = "pyopa" +version = "0.8.4" +description = "PyOPA - optimal pairwise sequence alignments" +optional = false +python-versions = "*" +files = [ + {file = "pyopa-0.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fd0b6dbfcd0397390065f0cecb973cc0daf1d98e2473bc4d4d0bc5cb7aa8b30"}, + {file = "pyopa-0.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:bdea0dc270fcd6496b1c932e8762b0f7919cd6a9a26dd152bd40041300684a17"}, + {file = "pyopa-0.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:067e0f692077c83b5b3344fbbd64f5d7439054ee7e986796cd1ec090885e1482"}, + {file = "pyopa-0.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f184f4b36fc8e4dd03c59a600b3256f12d39d02fb790469309931c87f78306f4"}, + {file = "pyopa-0.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1d08b21177d97bfdaee90148ece8e2b3c7579c4a211638ea5852b6607b464da3"}, + {file = "pyopa-0.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7699a06132a614915a8927d7d1367ef9e83f8707b17880a091be9b4638332d7a"}, + {file = "pyopa-0.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40517c93fcaa44608b94dd58b446e4a6a8d2ad52efa42b6c7bc99288474479"}, + {file = "pyopa-0.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:23a4627a018de0eb9539e158d77d4016af93c147bcbcb07cb20f7d8be7ccc824"}, + {file = "pyopa-0.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:db24b34516d04ad51ec38391f92f7a3900dadd2ffaff43294be6b1405e028be3"}, + {file = "pyopa-0.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37261da0339e77bb1c80b01844906db3df19817e277e334681621d4e03e7c951"}, + {file = "pyopa-0.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:37eaf805ce193eb288a38e4b4731f0a0f7f1a1b8b7c904efdfb8ed53cc084fbe"}, + {file = "pyopa-0.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:029fcacb44d2bcd1f5dced906037b4e17c6eb583cb15c6b52a208f3d28177520"}, + {file = "pyopa-0.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d897e5d38c86b6e0b1418449c420ac84f18baf000278ea0bfd47568a85b02463"}, + {file = "pyopa-0.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:be72a0a6a97f023bb8ffe1612cbe82487f08502ae4a315644abe3c8918fb2250"}, + {file = "pyopa-0.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:67b8204023ee0926bd755f735946dc1fc11ada09ec7d4e2835cf80f300bf043c"}, + {file = "pyopa-0.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:841e8fb227d1305853f87e454b6e186962b5bb8efcb82880fee1ebe4f43a5fc6"}, + {file = "pyopa-0.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0b07a8d01cc83ad99b2b958f2f829f2c62b04f2ff7b31c0fc9380714c03ba6b"}, + {file = "pyopa-0.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f006a1eb5fa848b5cecc2fa8bcbffb2201933504ea3fff08608b210210c778bd"}, + {file = "pyopa-0.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ff16bc7bb7a3f488c06f1aa0a5e7aa0268afe5bec77830276a36ef873802d4d2"}, + {file = "pyopa-0.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:274301ea0bad35f16a66484b9c2fb0a10cf3302abbad565454447f3846f12156"}, + {file = "pyopa-0.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc7784364e255bfad3bc102a4b88c891216db03d7104fe9f26f2b8e488f9cac6"}, + {file = "pyopa-0.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d375a5092e22a5635387270b2fbbc5aa380810ea58993f82d1b81d0e4950be51"}, + {file = "pyopa-0.8.4.tar.gz", hash = "sha256:f83d1a7fddeb8e5d4abd63c38c9f638b2330ef323b2501031b4f9efa67abe819"}, +] + +[package.dependencies] +numpy = "*" + +[[package]] +name = "pyparsing" +version = "3.1.1" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.1-py3-none-any.whl", hash = 
"sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, + {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2023.3.post1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"}, + {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, +] + +[[package]] +name = "rapidfuzz" +version = "3.4.0" +description = "rapid fuzzy string matching" +optional = false +python-versions = ">=3.7" +files = [ + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1438e68fe8869fe6819a313140e98641b34bfc89234b82486d8fd02044a067e8"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59f851c7a54a9652b9598553547e0940244bfce7c9b672bac728efa0b9028d03"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6286510910fcd649471a7f5b77fcc971e673729e7c84216dbf321bead580d5a1"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87409e12f9a82aa33a5b845c49dd8d5d4264f2f171f0a69ddc638e100fcc50de"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1d81d380ceabc8297880525c9d8b9e93fead38d3d2254e558c36c18aaf2553f"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a716efcfc92659d8695291f07da4fa60f42a131dc4ceab583931452dd5662e92"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83387fb81c4c0234b199110655779762dd5982cdf9de4f7c321110713193133e"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55efb3231bb954f3597313ebdf104289b8d139d5429ad517051855f84e12b94e"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:51d47d52c890cbdb2d8b2085d747e557f15efd9c990cb6ae624c8f6948c4aa3a"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3db79070888d0dcd4f6a20fd30b8184dd975d6b0f7818acff5d7e07eba19b71f"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:46efc5e4675e2bd5118427513f86eaf3689e1482ebd309ad4532bcefae78179d"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d15c364c5aa8f032dadf5b82fa02b7a4bd9688a961a27961cd5b985203f58037"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f1e91460baa42f5408f3c062913456a24b2fc1a181959b58a9c06b5eef700ca6"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c7f4f6dac25c120de8845a65a97090658c8a976827ac22b6b86e2a16a60bb820"}, + {file = 
"rapidfuzz-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:124578029d926b2be32d60b748be95ee0de6cb2753eb49d6d1d6146269b428b9"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:3af0384132e79fe6f6370d49347649382e04f689277525903bef84d30f3992fd"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:66ff93b81b382269dc7c2d46c839ce72e2d2331ad46a06321770bc94016fe236"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:da2764604a31fd1e3f1cacf226b43a871cc9f28844a3196c2a6b1ba52ae12922"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8eb33895353bfcc33ccf4b4bae837c0afb4eaf20a0361aa6f0800cef12505e91"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed3da08830c08c8bcd49414cc06b704a760d3067804775facc0df725b52085a4"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b38c7021f6114cfacba5717192fb3e1e50053261d49a774e645021a2f77e20a3"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5ea97886d2ec7b2b9a8172812a76e1d243f2ce705c2f24baf46f9ef5d3951"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b9a7ab061c1b75b274fc2ebd1d29cfa2e510c36e2f4cd9518a6d56d589003c8"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23b07685c21c93cdf6d68b49eccacfe975651b8d99ea8a02687400c60315e5bc"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c2a564f748497b6a5e08a1dc0ac06655f65377cf072c4f0e2c73818acc655d36"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ef30b5f2720f0acbcfba0e0661a4cc118621c47cf69b5fe92531dfed1e369e1c"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:ab981f9091ae8bd32bca9289fa1019b4ec656543489e7e13e64882d57d989282"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a80f9aa4245a49e0677896d1b51b2b3bc36472aff7cec31c4a96f789135f03fe"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d8c6cb80b5d2edf88bf6a88ac6827a353c974405c2d7e3025ed9527a5dbe1a6"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win32.whl", hash = "sha256:c0150d521199277b5ad8bd3b060a5f3c1dbdf11df0533b4d79f458ef11d07e8c"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:bd50bc90167601963e2a90b820fb862d239ecb096a991bf3ce33ffaa1d6eedee"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:bd10d68baabb63a3bb36b683f98fc481fcc62230e493e4b31e316bd5b299ef68"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7f497f850d46c5e08f3340343842a28ede5d3997e5d1cadbd265793cf47417e5"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7d6a9f04ea1277add8943d4e144e59215009f54f2668124ff26dee18a875343"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b6fe2aff0d9b35191701714e05afe08f79eaea376a3a6ca802b72d9e5b48b545"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b81b8bc29114ca861fed23da548a837832b85495b0c1b2600e6060e3cf4d50aa"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:805dc2aa3ac295dcbf2df8c1e420e8a73b1f632d6820a5a1c8506d22c11e0f27"}, + {file = 
"rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1276c7f50cd90a48b00084feb25256135c9ace6c599295dd5932949ec30c0e70"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0b9197656a6d71483959bf7d216e7fb7a6b80ca507433bcb3015fb92abc266f8"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3456f4df5b8800315fd161045c996479016c112228e4da370d09ed80c24853e5"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:734046d557550589edb83d5ad1468a1341d1092f1c64f26fd0b1fc50f9efdce1"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:37d5f0fbad6c092c89840eea2c4c845564d40849785de74c5e6ff48b47b0ecf6"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:bfe14711b9a7b744e242a482c6cabb696517a1a9946fc1e88d353cd3eb384788"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a733c10b1fcc47f837c23ab4a255cc4021a88939ff81baa64d6738231cba33d"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:929e6b71e5b36caee2ee11c209e75a0fcbd716a1b76ae6162b89ee9b591b63b1"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win32.whl", hash = "sha256:c56073ba1d1b25585359ad9769163cb2f3183e7a03c03b914a0667fcbd95dc5c"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:bf58ba21df06fc8aeef3056fd137eca0a593c2f5c82923a4524d251dc5f3df5d"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:f3effbe9c677658b3149da0d2778a740a6b7d8190c1407fd0c0770a4e223cfe0"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ed0d5761b44d9dd87278d5c32903bb55632346e4d84ea67ba2e4a84afc3b7d45"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bafbd3e2e9e0b5f740f66155cc7e1e23eee1e1f2c44eff12daf14f90af0e8ab"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2543fd8d0fb3b1ac065bf94ee54c0ea33343c62481d8e54b6117a88c92c9b721"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93ceb62ade1a0e62696487274002157a58bb751fc82cd25016fc5523ba558ca5"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76f4162ce5fe08609455d318936ed4aa709f40784be61fb4e200a378137b0230"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f723197f2dbce508a7030dcf6d3fc940117aa54fc876021bf6f6feeaf3825ba1"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:cfdc74afd93ac71270b5be5c25cb864b733b9ae32b07495705a6ac294ac4c390"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:273c7c7f5b405f2f54d41e805883572d57e1f0a56861f93ca5a6733672088acb"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:712dd91d429afaddbf7e86662155f2ad9bc8135fca5803a01035a3c1d76c5977"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:9814905414696080d8448d6e6df788a0148954ab34d7cd8d75bcb85ba30e0b25"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:01013ee67fb15608c8c5961af3bc2b1f242cff94c19f53237c9b3f0edb8e0a2d"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:8f5d2adc48c181486125d42230e80479a1e0568942e883d1ebdeb76cd3f83470"}, + {file = 
"rapidfuzz-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c92d847c997c384670e3b4cf6727cb73a4d7a7ba6457310e2083cf06d56013c4"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d0bda173b0ec1fa546f123088c0d42c9096304771b4c0555d4e08a66a246b3f6"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bbb05b1203f683b341f44ebe8fe38afed6e56f606094f9840d6406e4a7bf0eab"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f0075ff8990437923da42202b60cf04b5c122ee2856f0cf2344fb890cadecf57"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f295842c282fe7fe93bfe7a20e78f33f43418f47fb601f2f0a05df8a8282b43"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ebee7313719dfe652debb74bdd4024e8cf381a59adc6d065520ff927f3445f4"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f71454249ddd29d8ba5415ed7307e7b7493fc7e9018f1ff496127b8b9a8df94b"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52c6b7a178f0e800488fa1aede17b00f6397cab0b79d48531504b0d89e45315f"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d38596c804a9f2bd49360c15e1f4afbf016f181fe37fc4f1a4ddd247d3e91e5"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8756461e7ee79723b8f762fc6db226e65eb453bf9fa64b14fc0274d4aaaf9e21"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e14799297f194a4480f373e45142ef16d5dc68a42084c0e2018e0bdba56a8fef"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f813fb663d90038c1171d30ea1b6b275e09fced32f1d12b972c6045d9d4233f2"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:0df66e07e42e2831fae84dea481f7803bec7cfa53c31d770e86ac47bb18dcd57"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b05c7d4b4ddb617e977d648689013e50e5688140ee03538d3760a3a11d4fa8a2"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-win32.whl", hash = "sha256:74b9a1c1fc139d325fb0b89ccc85527d27096a76f6ed690ee3378143cc38e91d"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5fe3ef7daecd79f852936528e37528fd88818bc000991e0fea23b9ac5b79e875"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:61f16bb0f3026853500e7968261831a2e1a35d56947752bb6cf6953afd70b9de"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d188e8fb5a9709931c6a48cc62c4ac9b9d163969333711e426d9dbd134c1489b"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c006aa481d1b91c2600920ce16e42d208a4b6f318d393aef4dd2172d568f2641"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02afbe7ed12e9191082ed7bda43398baced1d9d805302b7b010d397de3ae973f"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01d64710060bc3241c08ac1f1a9012c7184f3f4c3d6e2eebb16c6093a03f6a67"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3198f70b97127e52a4f96bb2f7de447f89baa338ff398eb126930c8e3137ad1"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50ad7bac98a0f00492687eddda73d2c0bdf71c78b52fddaa5901634ae323d3ce"}, + {file = 
"rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc3efc06db79e818f4a6783a4e001b3c8b2c61bd05c0d5c4d333adaf64ed1b34"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:75d1365387ec8ef2128fd7e2f7436aa1a04a1953bc6d7068835bb769cd07c146"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a0750278693525b5ce58d3b313e432dfa5d90f00d06ae54fa8cde87f2a397eb0"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2e49151572b842d290dcee2cc6f9ce7a7b40b77cc20d0f6d6b54e7afb7bafa5c"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:8b38d7677b2f20b137bb7aaf0dcd3d8ac2a2cde65f09f5621bf3f57d9a1e5d6e"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d904ac97f2e370f91e8170802669c8ad68641bf84d742968416b53c5960410c6"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win32.whl", hash = "sha256:53bbef345644eac1c2d7cc21ade4fe9554fa289f60eb2c576f7fdc454dbc0641"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:233bf022938c38060a93863ec548e624d69a56d7384634d8bea435b915b88e52"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:63933792146f3d333680d415cecc237e6275b42ad948d0a798f9a81325517666"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e182ea5c809e7ed36ebfbcef4bb1808e213d27b33c036007a33bcbb7ba498356"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e1142c8d35fa6f3af8150d02ff8edcbea3723c851d889e8b2172e0d1b99f3f7"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b8258846e56b03230fa733d29bb4f9fb1f4790ac97d1ebe9faa3ff9d2850999"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:950d1dfd2927cd45c9bb2927933926718f0a17792841e651d42f4d1cb04a5c1d"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd54dd0355225dc3c1d55e233d510adcccee9bb25d656b4cf1136114b92e7bf3"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f5921780e7995e9ac3cea41fa57b623159d7295788618d3f2946d61328c25c25"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc4b1b69a64d337c40fa07a721dae1b1550d90f17973fb348055f6440d597e26"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5c8b901b6d3be63591c68e2612f76ad85af27193d0a88d4d87bb047aeafcb3"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c67f5ced39aff6277dd772b239ef8aa8fc810200a3b42f69ddbb085ea0e18232"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4fd94acab871afbc845400814134a83512a711e824dc2c9a9776d6123464a221"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:437508ec1ea6e71a77126715ac6208cb9c3e74272536ebfa79be9dd008cfb85f"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7215f7c5de912b364d5cf7c4c66915ccf4acf71aafbb8da62ad346569196e15"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:698488002eb7be2f737e48679ed0cd310b76291f26d8ec792db8345d13eb6573"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e77873126eb07e7461f0b675263e6c5d42c8a952e88e4a44eeff96f237b2b024"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:28d03cd33817f6e0bea9b618b460f85ff9c9c3fedc6c19cfa0992f719a0d1801"}, + {file = "rapidfuzz-3.4.0.tar.gz", hash = "sha256:a74112e2126b428c77db5e96f7ce34e91e750552147305b2d361122cbede2955"}, +] + +[package.extras] +full = ["numpy"] + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "scipy" +version = "1.11.3" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "scipy-1.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:370f569c57e1d888304052c18e58f4a927338eafdaef78613c685ca2ea0d1fa0"}, + {file = "scipy-1.11.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9885e3e4f13b2bd44aaf2a1a6390a11add9f48d5295f7a592393ceb8991577a3"}, + {file = "scipy-1.11.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e04aa19acc324a1a076abb4035dabe9b64badb19f76ad9c798bde39d41025cdc"}, + {file = "scipy-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1a8a4657673bfae1e05e1e1d6e94b0cabe5ed0c7c144c8aa7b7dbb774ce5c1"}, + {file = "scipy-1.11.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7abda0e62ef00cde826d441485e2e32fe737bdddee3324e35c0e01dee65e2a88"}, + {file = "scipy-1.11.3-cp310-cp310-win_amd64.whl", hash = "sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929"}, + {file = "scipy-1.11.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:925c6f09d0053b1c0f90b2d92d03b261e889b20d1c9b08a3a51f61afc5f58165"}, + {file = "scipy-1.11.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5664e364f90be8219283eeb844323ff8cd79d7acbd64e15eb9c46b9bc7f6a42a"}, + {file = "scipy-1.11.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3"}, + {file = "scipy-1.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f290cf561a4b4edfe8d1001ee4be6da60c1c4ea712985b58bf6bc62badee221"}, + {file = "scipy-1.11.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:91770cb3b1e81ae19463b3c235bf1e0e330767dca9eb4cd73ba3ded6c4151e4d"}, + {file = "scipy-1.11.3-cp311-cp311-win_amd64.whl", hash = "sha256:e1f97cd89c0fe1a0685f8f89d85fa305deb3067d0668151571ba50913e445820"}, + {file = "scipy-1.11.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dfcc1552add7cb7c13fb70efcb2389d0624d571aaf2c80b04117e2755a0c5d15"}, + {file = "scipy-1.11.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0d3a136ae1ff0883fffbb1b05b0b2fea251cb1046a5077d0b435a1839b3e52b7"}, + {file = "scipy-1.11.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bae66a2d7d5768eaa33008fa5a974389f167183c87bf39160d3fefe6664f8ddc"}, + {file = "scipy-1.11.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:d2f6dee6cbb0e263b8142ed587bc93e3ed5e777f1f75448d24fb923d9fd4dce6"}, + {file = "scipy-1.11.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:74e89dc5e00201e71dd94f5f382ab1c6a9f3ff806c7d24e4e90928bb1aafb280"}, + {file = "scipy-1.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:90271dbde4be191522b3903fc97334e3956d7cfb9cce3f0718d0ab4fd7d8bfd6"}, + {file = "scipy-1.11.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a63d1ec9cadecce838467ce0631c17c15c7197ae61e49429434ba01d618caa83"}, + {file = "scipy-1.11.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:5305792c7110e32ff155aed0df46aa60a60fc6e52cd4ee02cdeb67eaccd5356e"}, + {file = "scipy-1.11.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ea7f579182d83d00fed0e5c11a4aa5ffe01460444219dedc448a36adf0c3917"}, + {file = "scipy-1.11.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c77da50c9a91e23beb63c2a711ef9e9ca9a2060442757dffee34ea41847d8156"}, + {file = "scipy-1.11.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:15f237e890c24aef6891c7d008f9ff7e758c6ef39a2b5df264650eb7900403c0"}, + {file = "scipy-1.11.3-cp39-cp39-win_amd64.whl", hash = "sha256:4b4bb134c7aa457e26cc6ea482b016fef45db71417d55cc6d8f43d799cdf9ef2"}, + {file = "scipy-1.11.3.tar.gz", hash = "sha256:bba4d955f54edd61899776bad459bf7326e14b9fa1c552181f0479cc60a568cd"}, +] + +[package.dependencies] +numpy = ">=1.21.6,<1.28.0" + +[package.extras] +dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"] +test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "statsmodels" +version = "0.14.0" +description = "Statistical computations and models for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "statsmodels-0.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16bfe0c96a53b20fa19067e3b6bd2f1d39e30d4891ea0d7bc20734a0ae95942d"}, + {file = "statsmodels-0.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5a6a0a1a06ff79be8aa89c8494b33903442859add133f0dda1daf37c3c71682e"}, + {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77b3cd3a5268ef966a0a08582c591bd29c09c88b4566c892a7c087935234f285"}, + {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c64ebe9cf376cba0c31aed138e15ed179a1d128612dd241cdf299d159e5e882"}, + {file = "statsmodels-0.14.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:229b2f676b4a45cb62d132a105c9c06ca8a09ffba060abe34935391eb5d9ba87"}, + {file = "statsmodels-0.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb471f757fc45102a87e5d86e87dc2c8c78b34ad4f203679a46520f1d863b9da"}, + {file = "statsmodels-0.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:582f9e41092e342aaa04920d17cc3f97240e3ee198672f194719b5a3d08657d6"}, + {file = "statsmodels-0.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7ebe885ccaa64b4bc5ad49ac781c246e7a594b491f08ab4cfd5aa456c363a6f6"}, + {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b587ee5d23369a0e881da6e37f78371dce4238cf7638a455db4b633a1a1c62d6"}, + {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef7fa4813c7a73b0d8a0c830250f021c102c71c95e9fe0d6877bcfb56d38b8c"}, + {file = "statsmodels-0.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afe80544ef46730ea1b11cc655da27038bbaa7159dc5af4bc35bbc32982262f2"}, + {file = "statsmodels-0.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:a6ad7b8aadccd4e4dd7f315a07bef1bca41d194eeaf4ec600d20dea02d242fce"}, + {file = "statsmodels-0.14.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0eea4a0b761aebf0c355b726ac5616b9a8b618bd6e81a96b9f998a61f4fd7484"}, + {file = "statsmodels-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4c815ce7a699047727c65a7c179bff4031cff9ae90c78ca730cfd5200eb025dd"}, + {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:575f61337c8e406ae5fa074d34bc6eb77b5a57c544b2d4ee9bc3da6a0a084cf1"}, + {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8be53cdeb82f49c4cb0fda6d7eeeb2d67dbd50179b3e1033510e061863720d93"}, + {file = "statsmodels-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:6f7d762df4e04d1dde8127d07e91aff230eae643aa7078543e60e83e7d5b40db"}, + {file = "statsmodels-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc2c7931008a911e3060c77ea8933f63f7367c0f3af04f82db3a04808ad2cd2c"}, + {file = "statsmodels-0.14.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3757542c95247e4ab025291a740efa5da91dc11a05990c033d40fce31c450dc9"}, + {file = "statsmodels-0.14.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:de489e3ed315bdba55c9d1554a2e89faa65d212e365ab81bc323fa52681fc60e"}, + {file = "statsmodels-0.14.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e290f4718177bffa8823a780f3b882d56dd64ad1c18cfb4bc8b5558f3f5757"}, + {file = "statsmodels-0.14.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71054f9dbcead56def14e3c9db6f66f943110fdfb19713caf0eb0f08c1ec03fd"}, + {file = "statsmodels-0.14.0-cp38-cp38-win_amd64.whl", hash = "sha256:d7fda067837df94e0a614d93d3a38fb6868958d37f7f50afe2a534524f2660cb"}, + {file = "statsmodels-0.14.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1c7724ad573af26139a98393ae64bc318d1b19762b13442d96c7a3e793f495c3"}, + {file = "statsmodels-0.14.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3b0a135f3bfdeec987e36e3b3b4c53e0bb87a8d91464d2fcc4d169d176f46fdb"}, + {file = "statsmodels-0.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce28eb1c397dba437ec39b9ab18f2101806f388c7a0cf9cdfd8f09294ad1c799"}, + {file = "statsmodels-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b1c768dd94cc5ba8398121a632b673c625491aa7ed627b82cb4c880a25563f"}, + {file = "statsmodels-0.14.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d1e3e10dfbfcd58119ba5a4d3c7d519182b970a2aebaf0b6f539f55ae16058d"}, + {file = "statsmodels-0.14.0.tar.gz", hash = "sha256:6875c7d689e966d948f15eb816ab5616f4928706b180cf470fd5907ab6f647a4"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.18", 
markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, + {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, +] +packaging = ">=21.3" +pandas = ">=1.0" +patsy = ">=0.5.2" +scipy = ">=1.4,<1.9.2 || >1.9.2" + +[package.extras] +build = ["cython (>=0.29.26)"] +develop = ["colorama", "cython (>=0.29.26)", "cython (>=0.29.28,<3.0.0)", "flake8", "isort", "joblib", "matplotlib (>=3)", "oldest-supported-numpy (>=2022.4.18)", "pytest (>=7.0.1,<7.1.0)", "pytest-randomly", "pytest-xdist", "pywinpty", "setuptools-scm[toml] (>=7.0.0,<7.1.0)"] +docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "numpydoc", "pandas-datareader", "sphinx"] + +[[package]] +name = "tables" +version = "3.9.1" +description = "Hierarchical datasets for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "tables-3.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:784c1ffe7f972e69a9c97c0f164064e43617727668df4333802a7f23cfb06ee3"}, + {file = "tables-3.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af92f1e63b9fcadea621ab544540b7312553ea4f9456cf3d2728b48346fa557c"}, + {file = "tables-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f725f69d49f414736de24616b4ffa400127b86417bd14a11854aacd2a505b4d"}, + {file = "tables-3.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:e346249116b2eb95dd9277336c12f0d10d5328a5a3e8e16c74faa3c815817dc3"}, + {file = "tables-3.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f49e899247b541ed69d12fef10b5505b97243317a91b93927328c19a15d38671"}, + {file = "tables-3.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d1f2c947d63019db20728c6ecec39a1c900be00a65cae8025ac770148b641e8"}, + {file = "tables-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb89fab4a3c3cd98bd781913234e1f67464ff6e17662180cf718e67645a09271"}, + {file = "tables-3.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:aa176e1c72b0f935b0e607218ea8302378a39ed4fef5a544ebbd8d0523b56b86"}, + {file = "tables-3.9.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f482aaaa4b12d394421013cd4617d3e8a53a8d4a7a872454f7a13fb16c51a68e"}, + {file = "tables-3.9.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1813c0eced77540598987db32ce9e619d02b6032acdc3f59590d83c13bdb910c"}, + {file = "tables-3.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a64ce39652a2e2934f6d41500b2c6f8d4922e2022f1361e2302f3e85df4e2393"}, + {file = "tables-3.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:b49015aa8f576c6d5108c4aeb4d430bfcfc91ee8d0cca4d03e574e5485ffdc8b"}, + {file = "tables-3.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50140091af9d60eb3f806d3ee43f542beae569888c37ae96d6a1c887c389d8c8"}, + {file = "tables-3.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:282a0747b3ce4e3108bcd443361e031c9817bf7e84358317723a51b9c02c5655"}, + {file = "tables-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0295123272bb49efbebdc9b1e2b72baa99c5761b78fccacedbf44c52a5fa51ac"}, + {file = "tables-3.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:22084019437c504917ba8c0b2af75419e3d5c8ffc6d2ef4cd44031f06939518c"}, + {file = "tables-3.9.1.tar.gz", hash = "sha256:48331503cd509c9f1f95cf2f5c64a57c48c0aa5141423f0eca352965c4f9bf81"}, +] 
+ +[package.dependencies] +blosc2 = ">=2.2.8" +numexpr = ">=2.6.2" +numpy = ">=1.19.0" +packaging = "*" +py-cpuinfo = "*" + +[package.extras] +doc = ["ipython", "numpydoc", "sphinx (>=1.1,<6)", "sphinx-rtd-theme"] + +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + +[[package]] +name = "urllib3" +version = "2.0.6" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.6-py3-none-any.whl", hash = "sha256:7a7c7003b000adf9e7ca2a377c9688bbc54ed41b985789ed576570342a375cd2"}, + {file = "urllib3-2.0.6.tar.gz", hash = "sha256:b19e1a85d206b56d7df1d5e683df4a7725252a964e3993648dd0fb5a1c157564"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "xlsxwriter" +version = "3.1.6" +description = "A Python module for creating Excel XLSX files." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.1.6-py3-none-any.whl", hash = "sha256:fc3838232f9f50763c1e81a3b381c6ad559dcdcd0983ee239bf54556392b4f3f"}, + {file = "XlsxWriter-3.1.6.tar.gz", hash = "sha256:2087abdaa4a5e981a3ae50b5c21ff1adae59c8fecb6157808585fc169a6bfcd9"}, +] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.10,<3.13" +content-hash = "929e4f186404a38e3a3b0ca423cad177b5271758033b5e306e33e25dae89387b" diff --git a/pyproject.toml b/pyproject.toml index 6ce4573..08b57d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,59 +1,21 @@ -[build-system] -requires = ["setuptools>=61.0"] - - -build-backend = "setuptools.build_meta" -[project] -name = "HogProf" -dynamic = ["entry-points"] -version = "0.0.8" -authors = [ - { name="Dave Moi", email="dmoi@unil.ch" }, -] -dependencies = ["biopython", -"certifi", -"chardet", -"datasketch", -"ete3", -"future", -"goatools", -"h5py", -"idna", -"lxml", -"numexpr", -"numpy", -"pandas", -"pyham>=1.1.10", -"pyoma", -"pyopa", -"python-dateutil", -"pytz", -"requests", -"scipy", -"six", -"tables", -"urllib3", -"tqdm" -] -description = "Phylogenetic Profiling with OMA and minhashing" +[tool.poetry] +name = "hogprof" +version = "0.1.0" +description = "" +authors = ["Your Name "] readme = "README.md" -license = { file="LICENSE" } +[tool.poetry.dependencies] +python = ">=3.10,<3.13" +tables = "^3.9.1" +pyoma = "^0.12.1" +pandas = "^2.1.1" +h5py = "^3.9.0" +ete3 = "^3.1.3" +pyham = "^1.1.12" +goatools = "^1.3.9" -requires-python = ">=3.7" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] -[project.urls] -"Homepage" = "https://github.com/DessimozLab/HogProf" -"Bug Tracker" = "https://github.com/DessimozLab/HogProf/issues" -"Docs" = "https://dessimozlab.github.io/HogProf/" - -[tool.setuptools] -package-dir = {"" = "src"} - -[tool.setuptools.packages.find] -where = ["src"] \ No newline at end of file +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/src/HogProf.egg-info/PKG-INFO b/src/HogProf.egg-info/PKG-INFO deleted file mode 100644 index 38ed72e..0000000 --- a/src/HogProf.egg-info/PKG-INFO +++ /dev/null @@ -1,113 +0,0 @@ -Metadata-Version: 2.1 -Name: HogProf -Version: 0.0.8 -Summary: Phylogenetic Profiling with OMA and minhashing -Author-email: Dave Moi -License: MIT License - - Copyright (c) 2019 David Moi - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
- -Project-URL: Homepage, https://github.com/DessimozLab/HogProf -Project-URL: Bug Tracker, https://github.com/DessimozLab/HogProf/issues -Project-URL: Docs, https://dessimozlab.github.io/HogProf/ -Classifier: Programming Language :: Python :: 3 -Classifier: License :: OSI Approved :: MIT License -Classifier: Operating System :: OS Independent -Requires-Python: >=3.7 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: biopython -Requires-Dist: certifi -Requires-Dist: chardet -Requires-Dist: datasketch -Requires-Dist: ete3 -Requires-Dist: future -Requires-Dist: goatools -Requires-Dist: h5py -Requires-Dist: idna -Requires-Dist: lxml -Requires-Dist: numexpr -Requires-Dist: numpy -Requires-Dist: pandas -Requires-Dist: pyham>=1.1.10 -Requires-Dist: pyoma -Requires-Dist: pyopa -Requires-Dist: python-dateutil -Requires-Dist: pytz -Requires-Dist: requests -Requires-Dist: scipy -Requires-Dist: six -Requires-Dist: tables -Requires-Dist: urllib3 -Requires-Dist: tqdm - -# HogProf - - HogProf is an extensible and tunable approach to phylogenetic profiling using orthology data. It is powered by minhash based datastructures and computationally efficient. - - Still under major development and may change - -# Features - - - Using orthoxoml files and a taxonomy calculated enhanced phylogenies of each family - - These are transformed into minhash signatures and a locally sensitive hashing forest object for search and comparison of profiles - - Taxonomic levels and evolutionary event types ( presence, loss, duplication ) can have custom weight in profile construction - - Optimization of weights using machine learning - -If you run into any problems feel free to contact me at [dmoi@unil.ch](dmoi@unil.ch) - -# Quickstart - -to install from github -``` -$ git clone https://github.com/DessimozLab/HogProf.git -$ pip install -r pipreqs.txt . -``` -or to install from pypi -``` -$ pip install hogprof -``` - - -lets get a current version of the OMA hdf5 file and GAF. This will alow us to use the HOGs and study the functional enrichment of our search results. - -``` -$ cd ../.. -$ mkdir YourOmaDirectory -$ cd YourOmaDirectory -$ wget https://omabrowser.org/All/OmaServer.h5 -$ wget https://omabrowser.org/All/oma-go.txt.gz -``` - -We also need to make a location to store our pyprofiler databases - -``` -$ cd .. -$ mkdir YourHogProfDirectory -``` - -Ok. We're ready! Now let's compile a database containing all HOGs and our desired taxonomic levels using default settings. Launch the lshbuilder. -dbtypes available on the command line are : all , plants , archaea, bacteria , eukarya , protists , fungi , metazoa and vertebrates. These will use the NCBI taxonomy as a tree to annotate events in different gene family's histories. -``` -$python lshbuilder.py --outpath YourHogProfDirectory --dbtype all --OMA YourOmaDirectory/OmaServer.h5 --nthreads numberOfCPUcores - -``` -This should build a taxonomic tree for the genomes contained in the release and then calculate enhanced phylogenies for all HOGs in OMA. - -Once the database is completed it can be interogated using a profiler object. Construction and usage of this object should be done using a python script or notebook. This shown in the example notebook searchenrich.ipynb found in the examples. Please feel free to modify it to suit the needs of your own research. 
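
Once the database is built, a first query from a Python script might look like the sketch below. The module and class names (`HogProf.profiler.Profiler`) come from this repository, but the constructor arguments, file names and the query method shown here are illustrative assumptions rather than the exact API; the `searchenrich.ipynb` notebook in `examples/` remains the authoritative reference.

```
# Illustrative sketch only: argument and method names are assumptions drawn from the
# quickstart above, not a verbatim API reference -- see examples/searchenrich.ipynb.
from HogProf import profiler

# Files produced by lshbuilder.py in YourHogProfDirectory (names are hypothetical).
p = profiler.Profiler(
    lshforestpath="YourHogProfDirectory/newlshforest.pkl",  # assumed LSH forest pickle
    hashes_h5="YourHogProfDirectory/hashes.h5",             # assumed minhash signature store
    oma="YourOmaDirectory/OmaServer.h5",                    # OMA HDF5 used to build the database
)

# Retrieve the families whose phylogenetic profiles are most similar to a query HOG.
query_fam = 12345                            # hypothetical HOG family id
hits = p.hog_query(fam_id=query_fam, k=10)   # method name assumed; returns candidate families
print(hits)
```

Because the LSH forest returns approximate nearest neighbours, the hits are candidate families intended for downstream filtering or functional enrichment, as done in the notebook.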
diff --git a/src/HogProf/__pycache__/__init__.cpython-310.pyc b/src/HogProf/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 324adb641d338e1cac4fc23f4f3e24282ac688dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 206 zcmd1j<>g`kf?CZ3$$~)oF^Gc<7=auIATH(r5-AK(3@MDk44O>0*gf*o1B&v~{4^PF zvE(J@rmkctVg@P!6TcGmGxBp&^^+5mOG=BgGgI_K{XBhq{X+B$it@8klS_*Aq1xk% zQqzEn@{9C~i;`i2`g!>!sY&_y*~R*LdLRT?^7T9a`n}3u&gieVfZhBhd1FDd=`bWlwG$P zcFDGR@6?@=BiBU9m20w;lxwP#!ZlG(pE2#cQf8>$sb@=BJiGOgMy`}=jFv_l`BGl` zOxDL5W2Lcf|Kp_z`EIf_Dc4>PLFkJ^j7oZ6@EK>Ja(U)_mo-XBwUsk<**m+aCw z?hEQ3xt~z4QU~yUQq8J+)qVJKO1)a$kD6(9P!)0AC8RhMd(Y?=*Qh%xn>ki!cb^Ka z`DMQv7Ap06>wHiMm;AzN;Oj!zDx6*Q^_tW?_1vk#`C7PCICc7opipV5!lI8)3iVnL z7Fr7h?FXy%(7c_mR9CXu%-qTAmTg(p^bq~XiynTD;}?7aZ$s?rCG28oN49bwOK7Nhgo#;ShLtUF+4gKHVFgRT=wxty z(F`h$HUOb;u+|LGbZ)gV52*mJ_g+9iASi4bYTqv$^h5vQ*)zCB>!&M0NEzY_G*Sgl zXwh%_x)KTvDi``2JZZlT(eg37u-0mFbjoLA(7Sw&&Yomn zg@dYAy zwlvn#ZedVz0VyE!;S$t*S@u%R&mOOYmHA3QDxQA&SV46kyF1$>*LlCTxD;Y{Zd~!u zW|CO42d25~f?ugFar9P`eLZ{P*zxk|XV08ET{u_;1XkvSF;1>Fg@W>h_A4^wmJXL% z7aDcpnV_(sTMbT;-JCl1borSlpE>c=lXE9--qA1vU}ndf%@#Hm+#z$6{wf9CI&XS> z?$meQCHdS;tlJUh_Dn@%+rf@KsR1wgLD;SU&dF^L7FuoLmO4oO3~LR86xy}wN*%MK zT$yi%HepDjZq>&vB&<;h%in$yXr*fxZYrj&M&;Iv80H~io{GNp2O%0%baknAPGoxv z4s0RSO%b7R++V0b9g3tKe&oA3tPxq+=UdWzNVH|m{I)YvZy62*?}PJ+q2idg}*q_Ua4P)lBEHZ!UlqraiV#CrSmX=4H8C}K37vd0lT}^ z#4y8+z$?h7e;zMnBJ9IRsTfCuIhazEVu7A`glp?cWt=$QSd87{r{_)-?~jvh09zQl zt+wBcQ)Z+%$t4A`OWBH32Fl~4EGu>i(XmU6HpZGDckHhCYeDSwHY834?RqWLyRg7G zd0y8-za*3BB5CY$F|m6F^NKSkE>wNta-Bl^(o|Oz{W4cjmT~nA-;8gEy*R^(m&I~& zIF{E#C_Rlw?Xqrq(BFUzdN*HX7KkOS7R=3byP~NjsYWppOsH}XQ_5iEI6FY2(jLBr zaatx_0pexCLmA)@=Z3*1_E2@0!dMzTb!-SSirLr={Q5#^6vQ1!ygtC8y`JpF35=`n z<2jT}vBwXceC+8%#~*wAP*wRs zP-`4Mqzv~TQjJ#a5Lk@XWpLr9pp$`SR+N)$Mq30VY&K!cyc2_O2GIZ<0>=kQy!U-zjv(s>Gy{H$23LC?^I%^MWvYhrMn1G~30 z%mW}OOh4l`QNuQBfwAJ+wYCqL@&j~KB|i85$ee_eDg3_r0E$Jh;ENOMaE_KOjeS_g zE-X9eaL+5B_gWut&Vv1f8?Gf{x84eXwKS{*&#pPKGyA&OZeFv6nTEI~dz9W=wbf`p zTH}Bm{Q;|UsLw9OBrkV`vwZr@$=Rdz7ObS;QQY*tz3B?{W9YCyFSXjP>l3UY%<7XY zp6oTyWMlni6wAE%J`^|Y_N%jaD}$gE)Z|=?OHsPQj4@=0NQP4HoL8VqU)ympB3QNFQU9*RgnrbzZ0|VqC{A z?bkTbskLLL*^ZMPU$@YA-ERt=;*8&1#a6;fjMJ5%qIG4hI4V0CCm`=oPBcmh7_k$! 
zp-EdofW_f*g(Tths6^qyy4NtFV8%ZbN;C=y@0R{@>ycbm>=KBz}E0cQqIkuu~ zYbsPD+eJsefCpHoFt&=*BHX&f*V}|U5%{5gJDUPBjQt^ipii-}K;S!g_aY0*sD394 z%9DN<3jxRq@8(f-#a&lfp;-7RN@JBKq#8I!CQ$uI=~@L-T+VGd-v2FXa6-k`aV4Kq-|***Z|!jTB(N!K8; z*GI!lG!l)9K{5jUbQ#~}ObrZ@{P0s`5BdTCjHyhNQwf!TK{6%=$=hL6jUr+RlSk!f zlrZjnHd7q!tO5warCLy+HI8_bxEc%;p^A6_e*-ZM7#p2_{HPa$FwvwfHp^|9EEutq z>kbzHPQo*DH&xXG3*gou@pIUAQTjt%V}|&S;9zLDr`Y337CYD&AMFiy^NG)~$!QkP zu((-IALGksS?quq`fYdtD}W#+KoKY4(}@8f@Sq1Ma7zGyAHNzF$Bp?$Nv2zImacPn zmM$~iZUYA+)O{OZ2ncq-07D_s@+m{~4`GbpAd2y%1GmeAh+6S6Y!Pt+8`A-VvS0>U;Rg`@hbI7$4y_GaB_J9I_jm77H4FAX{11lw zaZNz{YPuJG7P;a9U`D~QfZN20uPkApn3(j5(UPL||3-i&) z^7zIC(g?QPPnvr|6#SJO<~$nZ(9`64KC+GcWBAqUSTv&V`i02ZI@j7Lc%xU%?mGMZ4C*IVlo(dfoZG|BItx0-G*-GO!QRwF`< zEqw$(%=CazZr}19%lo5Il~bcvo%Jcq|ITPi zY;!kYH=}kVWiX=l9JRv2#yz~-i!{ypG~T~zeOGvZAc$1WbhK;1c`IprDoTSI>2>VL z#x#>HsQYiJ%c5?uBlpUVd{cRFrT2Dz3^3CNGYO!>ps6BQ6f&UO4b>ZC);L)0N)0|2 z;^Q=KW(#mqs;hOhE?>a=dL0QWrWYj44WFEG?XYSh>V(kqt?k}hFP@_c^J@jk5~AG} zyr8b)kCGn{AP^O>V1`)BhzLmT$X<3qnuOdVf`EH*nSt?1Wk)XRlTlJRD_{nY6K?M% zxSPeqobf7XG$D|XWT@tXxvUK=1w_A=Du^&7&oB9!E(zSRVk%A`8dOTjCl7V6hXpAf zlJcv@oLLIg^fOckb^CIf%(28tnl*4}Gu4@GU%{ z6tn{XFLrSWt~r{Lj}Vw{!EOok%Xl(|l`Qw5JptK)^V1BI8-enO5>r2r{FZzF$dHj z02{kc&Ye6RCv~NX24;z%lPvK9hfj)<6vHldYjB_SMO5oHi>qvqT5Z;tX#G*vh>IBV zE->%f$bVp$gSzj-LtL!J>1qoa9X_MeR&j=*$de53Bh1M7pm>u>IE;%v!nL~9ZdT$X zGC+01I&tb8rc+&slc8P>mqaxoi0kVHVi%!boG}5Xv`|GYAlPc~Jcu*ZrPbz2fc&1m z%;}wDvC27Sx;qLN6W%AuB<5zEoUeq{rBXVm!~H~GE_~Dg10&^87kw3{>ANDI0w=iD z#3HM$RmO>#@c}{oP%V_wZ$tbO;$BLxnVap5o9Xl7FJUqPRjPB*OJ%e4NU6KPz|Ot) zq%-a8!-x><0^#`sdF=+3P zP6N5%ar}aFuwspZi=d{U9_+qy`wr9`R7z$@)n&J-y6g?AE-R`I!LT7!cRy$F;sF3T z4k#eL`3zK!Ky?SXaafm7A?rCPmXuK}%3II9V8ITh+NuG%q4pRa6Zk#}wKb|Tuwuuk zI#m{GD-X5xkvMEs+9}lOOcP-CE8LBj>s_|$XC_3JG>n&6WS!{t~Go1PD+QMLG z#*FD9^PB_2@1j`9zT0&GF`E90=m2H|#2HP|8QtcF_S~%}&?vSSwz0AJpp?V_Tq3my2ygL`X%(BudpDxo|zMTHSl^}JjWm8y8??JLQ&kUKg9bVXYms(ev-vc zvG{2gudw(T7G!Dtvn&W|`ok>5t{ni@yI4om(jQ?VHvK1g_i+|K&*B$Q6eo-!`zx&Y zMHUp?IE`@Fd<*AJ`oN+;#h1U#;#XPF@-=qur+N2lEPkEEZ?O1H7NXCjcmp0|2CcTy z?!QnU(74a1Owh_egxduPK6wBF3O+@@4-<}oiL3yDd$9`^U(1b50zwdD-xA*1Hm3d^ zjJL%OrrAKy5Rl+hnM#15;fTrx?jb%-Xd}LhqaVvmKOkCuH38#64!|Hfp4f!T8zpHh zeEo~y$tRGjY!)iCeabaekklR6mG_8CSIp6i2fH>9E2t*A=L5E7%aMkV%w_ka0aCxr zKpvqZ+35Cvq?JG3F4% zy$_TsrcJnok%M>?gGlbHyrdQ$k<^wn(s%Q1Qob#wOtds*h=W9^5GJ-);;b}4k_}l^ z=3lt*?K~v87iq_kgrrtHFN*|I-f~YrYbO8{f`Q50RW)dRLrb22LYiO5+cBzW1gC-jb{!9m#n71(PTw zmS_^8V(}58JKq?BX!1wbO$9s$h1)`99Mv+%ZHUEZ81I8ij2J&qlE7yU$G|~`r(!CB zXy+>RRWV}zU$EP#jA3g@2FExBGf?kn8fnk-WVmvvnC}gYNB$RVIfxs$HNY}2yE1YR9bfv5t#p+ITXx|D0e0#9aof%%=AYpy&O%uJBA48rL zW&-v1=5T&Y|2f*|zeK^1jiJmoZ7rIRgXkuj4G9mY(?#+j;pwYD2rC2INL(WTgr=pG zWU$xtMbzqhQJ{qw2LHf{zhF^d5wj59`g`69fR79fRw938@oGAgMNpbH`;gELHQYiO zQiVSeI=UHTsSq|cBDqq^$r3w+#Vt|gyI?ywV7K1$EmR&b#_UBQ!HPVeUpOb_cAli{LiUGv z&}`#vFccda$rm>f)-`u*C>L4bBX=gY&%oYBC>1fL$|BAd*P)5IZ*jWYi3$6-kcNk1 zdl`As*5YBaivyU*V--*sJ^f$fMp6mScuXI9IJV_nbiSEQ@c2?^cON>oesu$Q^r7^- z&{c#Z^V~P_r2q<&5Ok49beT!uxg>-@XvHRQMO=SVg07uK>2o{3=VMDlxYAN3!uJxJ$J;uwFty1%&|QN{C)! zY9VU4X`Ol;|q6YXQ$Y_G?uo{fLSK@e){3cM5XQIIa??lx`JJ0m3qE%B~r z`F|1jLR(Y_b%odxY3TE;px=*RF_P%(9KL}mn-_KhKh!~Gh=(Iv4g;pbk!`V3;LUP8 zXw<=oVoSUkwH!<=mBZFh(7h)EI}c~X7h!0w=$$O6vGqT*_yUTe8{1D9c}*ek>jz8v zU-(Xz`DNY-%4P?nyqB{e$j%^^%D{)4?o92l_$@nxZ;dNz7F{{?Q86_)H+NE# zo^#+!p7zik6cCO6{Yhk}<|LkFRy@Pgx;$%GF2~t&87I(J>%7mE5e%u+yKmCvvT9Y! 
zW&J(u^B9W;3(evJi{E1LyDa{U#ow~{9E-n05$9lfg|#aF55rPR>3?GVzp?mt7XQKG zKUw?_ivc_i1el;mY6K6U@GRHKy51W-&)etSncbH?mOYf6@TR?-17pI;gFxfioHvt8 zXnt4RT^en#;pFq|JS|%!H^`ZqVG*2VM}W8zm;mQJQUiLbR)eW8{67-7w>Z4l?q=-{ zq5;lmn`6+k^4|=NFGKE?Q|TssG}b^Is@66+BxIOOjFxDpKgxoUSsv{MPjG@etvL3M bnyq*gj9`|XvrKGrP) base 10. - Heavily adapted, but based on https://gist.github.com/bitsandbooks/2649444 - - NOTE: any sequences that are being decoded should be sanitised first. - - -- Alex Warwick Vesztrocy, May-June 2017 -''' -import numpy as np - - -# "digits" -DIGITS_AA = np.fromstring('ACDEFGHIKLMNPQRSTVWXY', dtype='S1') -DIGITS_DNA = np.fromstring('ACGTX', dtype='S1') - - -class KmerEncoder(object): - def __init__(self, k, is_protein=True): - ''' - Initialise the kmer converter. k is the kmer length. - If is_dna=True then DNA else AA. - ''' - self.digits = DIGITS_AA if is_protein else DIGITS_DNA - self.k = int(k) # Cast incase np-type for n below. - self.max = (len(self.digits) ** self.k) - 1 - self.n = self.decode(self.digits[-1] * self.k) + 1 - self._prot = np.zeros((self.k,), dtype='S1') - - def __len__(self): - ''' - Return the maximum integer-representation of the kmer length - in this converter. - ''' - return self.n - - def encode(self, seq): - ''' - Encode integer kmer in protein chars. - ''' - if seq <= self.max: - self._prot[:] = self.digits[0] - i = -1 - while seq > 0: - self._prot[i] = self.digits[seq % self.digits.size] - seq //= self.digits.size - i -= 1 - return self._prot.tostring() - else: - raise ValueError('{} Larger than largest kmer of size {}' - .format(seq, self.k)) - - def decode(self, seq): - ''' - Decode a protein kmer -> integer. NOTE: sanitisation to a byte - string required first. - ''' - x = 0 - for digit in seq[:self.k].decode('ascii'): - x = (x * self.digits.size) + np.searchsorted(self.digits, digit) - return x - - def decompose(self, seq): - ''' - Decompose a sequence into counts of its constituent (decoded) kmers. - ''' - for i in range(len(seq) - self.k + 1): - yield self.decode(seq[i:(i + self.k)]) diff --git a/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py b/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py deleted file mode 100755 index e16541e..0000000 --- a/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py +++ /dev/null @@ -1,195 +0,0 @@ -from __future__ import unicode_literals, division -from builtins import str -import lxml.etree as etree -import os -import errno -import logging - -logger = logging.getLogger(__name__) - - -class OrthoXMLSplitter(object): - """Convert orthoxml files with several families. - - This class provides the means to extract a subset of root HOGs (i.e. - families) into a new output orthoxml file, or to split it and create - for each family an individual file. - - The object should be instantiated with the input orthoxml file and - optionally a cache_dir argument where the output orthoxml files will - be stored. This later parameter can be overwritten in the __call__ - method call that does the work. - - .. note:: - - Calls to the splitter will remove the created families from the - loaded input file, so subsequent calls that contain a family in - common will miss them from the second call onwards. 
- - - :Example: - - splitter = OrthoXMLSplitter("data.orthoxml", cache_dir="./splits") - splitter() - - will create files HOGxxxxxx.orthoxml in the ./splits directory.""" - - def __init__(self, xml_file, cache_dir=None): - self.xml_file = xml_file - if cache_dir is not None: - self._assert_cache_dir(cache_dir) - logger.info('loading xml file {}...'.format(xml_file)) - parser = etree.XMLParser(remove_blank_text=True) - self.Etree_XML = etree.parse(self.xml_file, parser=parser) - self.Etree_root = self.Etree_XML.getroot() - logger.info('building lookup table for genes') - self.gene_lookup = {gene.get('id'): gene for gene in self._iter_gene_elements()} - logger.info('init of OrthoXMLSplitter finished') - - def _assert_cache_dir(self, cache_dir): - # Ensure existance of cache directory (py2 compat) - try: - os.makedirs(cache_dir) - except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(cache_dir): - pass - else: - raise - self.cache_dir = cache_dir - - def _iter_gene_elements(self): - """This method is a faster version of xpath '//ns:gene'. - - It iterates the element in sequential order""" - for node in self.Etree_root: - if node.tag == "{http://orthoXML.org/2011/}species": - for gene in node.iter('{http://orthoXML.org/2011/}gene'): - yield gene - - def _iter_toplevel_groups(self): - """This method yields all the root hogs sequentially.""" - for node in self.Etree_root: - if node.tag == "{http://orthoXML.org/2011/}groups": - for root_hog in node: - yield root_hog - - def __call__(self, hogs_to_extract=None, single_hog_files=False, basename=None, cache_dir=None): - """Split/extract hogs from orthoxml file based on root hogs ids. - - Split the input orthoxml or extract a subset of root hogs. If no - argument is passed, one orthoxml file per root hog is created, - named as 'HOGxxxxxx.orthoxml', where xxxxxx is the numeric id of - each hog. - - The set of root hogs to be extracted can be limited by specifying - a subset of hog ids in the hogs_to_extract parameter. If - single_hog_files is set to true, each of these hogs will be converted - into a single orthoxml file named as explained above. If single_hog_files - is set to false, the whole subset of hogs will be stored in one - orthoxml file named as specified in `basename`. - - The file(s) will be stored in the cache_dir folder which can be - specified in the constructor or overwritten as an argument in - this method. - - :param hogs_to_extract: list or set that contains the set of root - hogs to be extracted. If set to None, all hogs are extracted. - :param bool single_hog_files: whether or not to build one orthoxml - file for all the selected hogs or individual ones. - :param str basename: name of the output file if a subset of hogs - is extracted into a single file. - :param str cache_dir: folder where to store the output files. 
- """ - if cache_dir is not None: - self._assert_cache_dir(cache_dir) - elif self.cache_dir is None: - raise RuntimeError("cache dir to output files to is not set") - - if single_hog_files: - if hogs_to_extract is None: - raise RuntimeError('useless to extract all hogs into single output file') - if basename is None or not isinstance(basename, (str, bytes)): - raise ValueError('basename needs to be specified: {}'.format(basename)) - ogs = [og for og in self._iter_toplevel_groups() if int(og.get("id")) in hogs_to_extract] - fn = os.path.join(self.cache_dir, basename) - logger.info("extracting {:d} hogs into {:s}".format(len(ogs), fn)) - self.create_new_orthoxml(fn, ogs) - else: - for og in self._iter_toplevel_groups(): - if hogs_to_extract is None or int(og.get('id')) in hogs_to_extract: - hog_nr = int(og.get("id")) - hog_id = "HOG{:06d}.orthoxml".format(hog_nr) - fname = os.path.join(self.cache_dir, hog_id) - logger.info("extracting {} into {}".format(hog_id, fname)) - self.create_new_orthoxml(fname, [og]) - - def iter_generefs_in_og(self, og_node): - for node in og_node.iterdescendants('{http://orthoXML.org/2011/}geneRef'): - yield node - - def get_gene_via_generef(self, genesref_ids): - genesref_ids = set(genesref_ids) - return [self.gene_lookup[gene_id] for gene_id in genesref_ids] - - def create_new_orthoxml(self, fn, OGs): - """create a new orthoxml file for the passed orthologGroup elements. - - :param fn: the filename of the output file. The path needs to exists - prior to calling this method. - :param OGs: the orthologGroup elements that should be included in the - new output file.""" - # Get element to store - for og_node in OGs: - gene_ids = [gene_ref_elem.get("id") for gene_ref_elem in self.iter_generefs_in_og(og_node)] - gene_els = self.get_gene_via_generef(gene_ids) - - # Get all information to store - zoo = {} # <- {key:sp_etree || value: {key:db_el || values:[list_genes]}} - for gene_el in gene_els: # <- for all gene el - db_el = gene_el.getparent().getparent() - sp_el = db_el.getparent() - if sp_el in zoo.keys(): # <- if species already visited - if db_el in zoo[sp_el].keys(): # <- if db already visited so add gene - zoo[sp_el][db_el].append(gene_el) - else: # <- if db not visited so add db,genes - zoo[sp_el][db_el] = [] - zoo[sp_el][db_el].append(gene_el) - else: # <- if species not visited so add sp,db,gene - zoo[sp_el] = {} - zoo[sp_el][db_el] = [] - zoo[sp_el][db_el].append(gene_el) - - etree_2_dump = etree.Element("orthoXML", nsmap=self.Etree_root.nsmap) - for attr, value in self.Etree_root.items(): - etree_2_dump.set(attr, value) - - for species_el in zoo.keys(): - species_xml = etree.Element("species") - for attr, value in species_el.items(): - species_xml.set(attr, value) - etree_2_dump.insert(0, species_xml) - - for db_el in zoo[species_el].keys(): - # Add into - database_xml = etree.SubElement(species_xml, "database") - for attr, value in db_el.items(): - database_xml.set(attr, value) - - # Add TAG into - genes_xml = etree.SubElement(database_xml, "genes") - - # Fill with - for gene_el in zoo[species_el][db_el]: - gene_xml = etree.SubElement(genes_xml, "gene") - for attr, value in gene_el.attrib.items(): - gene_xml.set(attr, value) - - groupsxml = etree.SubElement(etree_2_dump, "groups") - for og_et in OGs: - if not og_et.get('id').startswith('HOG:'): - og_et.set('id', 'HOG:{:07d}'.format(int(og_et.get('id')))) - groupsxml.append(og_et) - - tree = etree.ElementTree(etree_2_dump) - tree.write(fn, xml_declaration=True, encoding='utf-8', method="xml", 
pretty_print=True) - diff --git a/src/HogProf/build/lib/pyoma/browser/__init__.py b/src/HogProf/build/lib/pyoma/browser/__init__.py deleted file mode 100755 index dd4c206..0000000 --- a/src/HogProf/build/lib/pyoma/browser/__init__.py +++ /dev/null @@ -1 +0,0 @@ -name = "browser" diff --git a/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py b/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py deleted file mode 100755 index 264de1d..0000000 --- a/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py +++ /dev/null @@ -1,82 +0,0 @@ -import random -import unittest -import os -import Bio.Seq -import Bio.Data.CodonTable -import pyoma.browser.db as pyomadb -import tables -import numpy - - -class DatabaseChecks(unittest.TestCase): - - @classmethod - def setUpClass(cls): - try: - path = os.environ['PYOMA_DB2CHECK'] - except KeyError: - raise unittest.SkipTest("No database specified in PYOMA_DB2CHECK") - - cls.db = pyomadb.Database(path) - - def translated_cdna_match_protein_sequence(self, cdna, prot): - cdna = cdna.replace('X', 'N') - for tab in Bio.Data.CodonTable.generic_by_id.keys(): - tab_ok = True - trans = Bio.Seq.translate(cdna, table=tab) - if not 3 >= len(trans) - len(prot) >= 0: - return False - for pos, (trans_aa, prot_aa) in enumerate(zip(trans, prot)): - if trans_aa == prot_aa or trans_aa == 'X' or prot_aa == 'X': - continue - elif prot_aa == 'M' and pos == 0 and trans_aa != '*': - continue - else: - tab_ok = False - break - if tab_ok: - return True - - def test_cdna_and_protein_sequence_match(self): - """test translated cdna sequence and protein sequence match. - - This is done for a random sample of 1000 entries""" - SAMPLES = 1000 - nr_entries = self.db.id_resolver.max_entry_nr - for entry_nr in random.sample(range(nr_entries+1), SAMPLES): - with self.subTest(entry_nr=entry_nr): - cdna = self.db.get_cdna(entry_nr).decode() - prot = self.db.get_sequence(entry_nr).decode() - self.assertTrue(self.translated_cdna_match_protein_sequence(cdna, prot)) - - def test_increasing_offsets(self): - entry_tab = self.db.get_hdf5_handle().get_node('/Protein/Entries') - seq_off = -1 - cds_off = -1 - for row in entry_tab: - self.assertLess(seq_off, row['SeqBufferOffset'], "SeqBufferOffset decreases in row {}: {} vs {}" - .format(row.nrow, seq_off, row['SeqBufferOffset'])) - self.assertLess(cds_off, row['CDNABufferOffset'], "CDNABufferOffset decreases in row {}: {} vs {}" - .format(row.nrow, seq_off, row['CDNABufferOffset'])) - seq_off = row['SeqBufferOffset'] - cds_off = row['CDNABufferOffset'] - - def test_homeology_flag(self): - genome_tab = self.db.get_hdf5_handle().get_node('/Genome') - for g in (b'WHEAT', b'GOSHI', b'BRANA'): - for row in genome_tab.read_where('UniProtSpeciesCode == g'): - self.assertTrue(row['IsPolyploid'], "{} is not recorded as polyploid genome".format(g)) - for g in (b'YEAST', b'HUMAN', b'PLAF7', b'ARATH', b'MOUSE'): - for row in genome_tab.read_where('UniProtSpeciesCode == g'): - self.assertFalse(row['IsPolyploid'], "{} is recorded to be a ployploid genome".format(g)) - - def test_synteny_scores_exist(self): - for g in ('WHEAT', 'BRANA', 'GOSHI'): - try: - t = self.db.get_hdf5_handle().get_node('/PairwiseRelation/{}/within'.format(g)) - except tables.NoSuchNodeError: - # if species does not exist, we skip - not all datasets will have these genomes - continue - syn_col = t.col('SyntenyConservationLocal') - computed_pairs = numpy.where(syn_col >= 0) - self.assertLess(0, len(computed_pairs[0]), "No synteny values computed for {}".format(g)) diff --git 
a/src/HogProf/build/lib/pyoma/browser/convert.py b/src/HogProf/build/lib/pyoma/browser/convert.py deleted file mode 100755 index a2814e6..0000000 --- a/src/HogProf/build/lib/pyoma/browser/convert.py +++ /dev/null @@ -1,1910 +0,0 @@ -from __future__ import division, print_function -from builtins import str, chr, range, object, super, bytes - -import pandas -from future.standard_library import hooks -from PySAIS import sais -from tempfile import NamedTemporaryFile -from tqdm import tqdm -import csv -import resource -import tables -import numpy -import numpy.lib.recfunctions -import os -import subprocess -import errno -import json -import time -import familyanalyzer -import re -import multiprocessing as mp -import lxml.html -import collections -import gzip -import hashlib -import itertools -import operator -import fileinput - -from .. import common -from . import locus_parser -from . import tablefmt -from .KmerEncoder import KmerEncoder -from .OrthoXMLSplitter import OrthoXMLSplitter -from .geneontology import GeneOntology, OntologyParser -from .synteny import SyntenyScorer -from .homoeologs import HomeologsConfidenceCalculator - -with hooks(): - import urllib.request - - -class DarwinException(Exception): - pass - - -def callDarwinExport(func, drwfile=None): - """Function starts a darwin session, loads convert.drw file - and calls the darwin function passed as argument. The output - is expected to be written by darwin in json format into the - file specified by 'outfn'. - This function returns the parsed json datastructure""" - - with NamedTemporaryFile(suffix='.dat') as tmpfile: - if drwfile is None: - drwfile = os.path.abspath(os.path.splitext(__file__)[0] + ".drw") - # with open(os.devnull, 'w') as DEVNULL: - stacksize = resource.getrlimit(resource.RLIMIT_STACK) - common.package_logger.info('current stacklimit: {}'.format(stacksize)) - common.package_logger.info('setting stacklimit: {}'.format((max(stacksize)-1, stacksize[1]))) - resource.setrlimit(resource.RLIMIT_STACK, (min(stacksize), stacksize[1])) - p = subprocess.Popen(['darwin', '-q', '-E', '-B'], stdin=subprocess.PIPE, - stderr=subprocess.PIPE, stdout=subprocess.PIPE) - drw_cmd = "outfn := '{}': ReadProgram('{}'): {}; done;".format( - tmpfile.name, - drwfile, - func).encode('utf-8') - common.package_logger.debug('calling darwin function: {}'.format(func)) - (stdout, stderr) = p.communicate(input=drw_cmd) - if p.returncode > 0: - raise DarwinException(p.stderr.read()) - - trans_tab = "".join(str(chr(x)) for x in range(128)) + " " * 128 - with open(tmpfile.name, 'r') as jsonData: - rawdata = jsonData.read() - return json.loads(rawdata.translate(trans_tab)) - - -def uniq(seq): - """return uniq elements of a list, preserving order - - :param seq: an iterable to be analyzed - """ - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] - - -def silentremove(filename): - """Function to remove a given file. No exception is raised if the - file does not exist. Other errors are passed to the user. 
- :param filename: the path of the file to be removed""" - try: - os.remove(filename) - except OSError as e: - if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory - raise # re-raise exception if a different error occured - - -def gz_is_empty(fname): - """Test if gzip file fname is empty - - Return True if the uncompressed data in fname has zero length - or if fname itself has zero length - Raises OSError if fname has non-zero length and is not a gzip file - """ - with gzip.open(fname, 'rb') as f: - data = f.read(1) - return len(data) == 0 - - -def load_tsv_to_numpy(args): - fn, off1, off2, swap = args - rel_desc = tablefmt.PairwiseRelationTable - # we need to get the enum as a dict to be able to extend it - # with the reversed labels, i.e. n:1 - relEnum = rel_desc.columns['RelType'].enum._names - relEnum['n:1'] = relEnum['m:1'] - relEnum['1:m'] = relEnum['1:n'] - relEnum['n:m'] = relEnum['m:n'] - read_dir = -1 if swap else 1 - tsv_dtype = [('EntryNr1', 'i4'), ('EntryNr2', 'i4'), ('Score', 'f4'), ('RelType', 'i1'), - ('AlignmentOverlap', 'f2'), ('Distance', 'f4')] - for curNr, curFn in enumerate([fn, fn.replace('.ext.', '.')]): - try: - if gz_is_empty(curFn): - return numpy.empty(0, dtype=tables.dtype_from_descr(rel_desc)) - with gzip.GzipFile(curFn) as fh: - data = numpy.genfromtxt(fh, dtype=tsv_dtype, - names=[_[0] for _ in tsv_dtype], - delimiter='\t', - usecols=(0, 1, 2, 3, 4, 5), - converters={'EntryNr1': lambda nr: int(nr) + off1, - 'EntryNr2': lambda nr: int(nr) + off2, - 'RelType': lambda rel: (relEnum[rel[::read_dir].decode()] - if len(rel) <= 3 - else relEnum[rel.decode()]), - 'Score': lambda score: float(score) / 100}) - break - except OSError as e: - if curNr < 1: - common.package_logger.info('tried to load {}'.format(curFn)) - pass - else: - raise e - - if swap: - reversed_cols = tuple(data.dtype.names[z] for z in (1, 0, 2, 3, 4, 5)) - data.dtype.names = reversed_cols - full_table = numpy.empty(data.size, dtype=tables.dtype_from_descr(rel_desc)) - common_cols = list(data.dtype.names) - full_table[common_cols] = data[common_cols] - for col_not_in_tsv in set(full_table.dtype.names) - set(data.dtype.names): - full_table[col_not_in_tsv] = rel_desc.columns[col_not_in_tsv].dflt - return full_table - - -def read_vps_from_tsv(gs, ref_genome): - ref_genome_idx = gs.get_where_list('(UniProtSpeciesCode=={!r})'. - format(ref_genome))[0] - job_args = [] - for g in range(len(gs)): - if g == ref_genome_idx: - continue - g1, g2 = sorted((g, ref_genome_idx,)) - off1, off2 = gs.read_coordinates(numpy.array((g1, g2)), 'EntryOff') - fn = os.path.join(os.environ['DARWIN_OMADATA_PATH'], 'Phase4', - gs.cols.UniProtSpeciesCode[g1].decode(), - gs.cols.UniProtSpeciesCode[g2].decode() + ".orth.txt.gz") - tup = (fn, off1, off2, g1 != ref_genome_idx) - common.package_logger.info('adding job: {}'.format(tup)) - job_args.append(tup) - - pool = mp.Pool(processes=min(os.cpu_count(), 10)) - all_pairs = pool.map(load_tsv_to_numpy, job_args) - pool.close() - return numpy.lib.recfunctions.stack_arrays(all_pairs, usemask=False) - - -class DataImportError(Exception): - pass - - -def _load_taxonomy_without_ref_to_itselfs(data): - dtype = tables.dtype_from_descr(tablefmt.TaxonomyTable) - arr = numpy.array([tuple(x) for x in data], dtype=dtype) - clean = arr[numpy.where(arr['NCBITaxonId'] != arr['ParentTaxonId'])] - return clean - - -def compute_ortholog_types(data, genome_offs): - """this function computes the type of orthologs from the data and sets in - the RelType column. 
- - :param data: a numpy recarray corresponding to the `numpy.dtype` of - `tablefmt.PairwiseRelationTable` - :param genome_offs: a numpy array with the genome offsets, i.e. the entry - numbers where the next genome starts - - :returns: a modified version of data - """ - typEnum = tablefmt.PairwiseRelationTable.columns.get('RelType').enum - query_type = {val: 'm' if cnt > 1 else '1' - for val, cnt in zip(*numpy.unique(data['EntryNr2'], - return_counts=True))} - - def genome_idx(enr): - return numpy.searchsorted(genome_offs, enr - 1, side='right') - - g0 = genome_idx(data[0]['EntryNr2']) - it = numpy.nditer(data, flags=['c_index'], op_flags=['readwrite']) - while not it.finished: - row0 = it[0] - i1 = it.index + 1 - # we move i1 forward to the row where the next genome starts, i.e. the - # current query changes the species or the query itself changes - while i1 < len(data): - row1 = data[i1] - g1 = genome_idx(row1['EntryNr2']) - if g1 != g0 or row0['EntryNr1'] != row1['EntryNr1']: - break - i1 += 1 - subj_type = 'n' if i1 - it.index > 1 else '1' - while not it.finished and it.index < i1: - typ = '{}:{}'.format(query_type[int(it[0]['EntryNr2'])], subj_type) - it[0]['RelType'] = typEnum[typ] - it.iternext() - g0 = g1 - - -def get_or_create_tables_node(h5, path, desc=None): - """return the node of a given path from the h5 file - - If the node does not yet exist, it is created (including potential - inexistant internal nodes). - - :param h5: Handle to the hdf5 object - :param str path: Path of the node to return - :param str desc: Description to be added to the node""" - try: - grp = h5.get_node(path) - except tables.NoSuchNodeError: - base, name = os.path.split(path) - grp = h5.create_group(base, name, title=desc, createparents=True) - return grp - - -class DarwinExporter(object): - DB_SCHEMA_VERSION = '3.2' - DRW_CONVERT_FILE = os.path.abspath(os.path.splitext(__file__)[0] + '.drw') - - def __init__(self, path, logger=None, mode=None): - self.logger = logger if logger is not None else common.package_logger - fn = os.path.normpath(os.path.join( - os.getenv('DARWIN_BROWSERDATA_PATH', ''), - path)) - if mode is None: - mode = 'append' if os.path.exists(fn) else 'write' - self._compr = tables.Filters(complevel=6, complib='zlib', fletcher32=True) - self.h5 = tables.open_file(fn, mode=mode[0], filters=self._compr) - self.logger.info("opened {} in {} mode, options {}".format( - fn, mode, str(self._compr))) - if mode == 'write': - self.h5.root._f_setattr('convertion_start', time.strftime("%c")) - - def call_darwin_export(self, func): - return callDarwinExport(func, self.DRW_CONVERT_FILE) - - def _get_or_create_node(self, path, desc=None): - return get_or_create_tables_node(self.h5, path, desc) - - def create_table_if_needed(self, parent, name, drop_data=False, **kwargs): - """create a table if needed. - - The function only checks whether a table exists with that name, - but not if it is compatible with the passed arguments. - if you pass data with the `obj` argument, this data is appended - to the table. If you set `drop_data` to True, data that was - previously in the existing table is dropped prior to adding new - data.""" - try: - tab = self.h5.get_node(parent, name=name) - if drop_data: - tab.remove_rows(0, tab.nrows) - if 'obj' in kwargs: - tab.append(kwargs['obj']) - except tables.NoSuchNodeError: - tab = self.h5.create_table(parent, name, **kwargs) - return tab - - def get_version(self): - """return version of the dataset. 
- - Default implementation searches for 'mname' in Matrix or matrix_stats.drw files. - """ - for fname in ('Matrix', 'matrix_stats.drw'): - with open(os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], fname), 'r') as fh: - for i, line in enumerate(fh): - if line.startswith('mname :='): - match = re.match(r'mname := \'(?P[^\']*)\'', line) - return match.group('version') - if i > 1000: - break - raise DataImportError('No version information found') - - def add_version(self): - version = self.get_version() - self.h5.set_node_attr('/', 'oma_version', version) - self.h5.set_node_attr('/', 'pytables', tables.get_pytables_version()) - self.h5.set_node_attr('/', 'hdf5_version', tables.get_hdf5_version()) - self.h5.set_node_attr('/', 'db_schema_version', self.DB_SCHEMA_VERSION) - - def add_species_data(self): - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'gs.json') - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - data = self.call_darwin_export('GetGenomeData();') - gstab = self.h5.create_table('/', 'Genome', tablefmt.GenomeTable, - expectedrows=len(data['GS'])) - gs_data = self._parse_date_columns(data['GS'], gstab) - self._write_to_table(gstab, gs_data) - gstab.cols.NCBITaxonId.create_csindex(filters=self._compr) - gstab.cols.UniProtSpeciesCode.create_csindex(filters=self._compr) - gstab.cols.EntryOff.create_csindex(filters=self._compr) - - taxtab = self.h5.create_table('/', 'Taxonomy', tablefmt.TaxonomyTable, - expectedrows=len(data['Tax'])) - self._write_to_table(taxtab, _load_taxonomy_without_ref_to_itselfs(data['Tax'])) - taxtab.cols.NCBITaxonId.create_csindex(filters=self._compr) - - def _parse_date_columns(self, data, tab): - """convert str values in a date column to epoch timestamps""" - time_cols = [i for i, col in enumerate(tab.colnames) if tab.coldescrs[col].kind == 'time'] - dflts = [tab.coldflts[col] for col in tab.colnames] - - def map_data(col, data): - try: - val = data[col] - if col in time_cols and isinstance(val, str): - for fmt in ('%b %d, %Y', '%B %d, %Y', '%d.%m.%Y', '%Y%m%d'): - try: - date = time.strptime(val, fmt) - return time.mktime(date) - except ValueError: - pass - raise ValueError("Cannot parse date of '{}'".format(val)) - return val - except IndexError: - return dflts[col] - - arr = numpy.empty(len(data), dtype=tab.dtype) - for i, row in enumerate(data): - as_tup = tuple(map_data(c, row) for c in range(len(dflts))) - arr[i] = as_tup - return arr - - def _convert_to_numpyarray(self, data, tab): - """convert a list of list dataset into a numpy rec array that - corresponds to the table definition of `tab`. - - :param data: the data to be converted. 
- :param tab: a pytables table node.""" - - enum_cols = {i: tab.get_enum(col) for (i, col) in enumerate(tab.colnames) - if tab.coltypes[col] == 'enum'} - dflts = [tab.coldflts[col] for col in tab.colnames] - - def map_data(col, data): - try: - val = data[col] - return enum_cols[col][val] - except IndexError: - return dflts[col] - except KeyError: - return val - - arr = numpy.empty(len(data), dtype=tab.dtype) - for i, row in enumerate(data): - as_tup = tuple(map_data(c, row) for c in range(len(dflts))) - arr[i] = as_tup - return arr - - def add_orthologs(self): - genome_offs = self.h5.root.Genome.col('EntryOff') - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'VPairs' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'vps', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - elif ((not os.getenv('DARWIN_OMADATA_PATH') is None) and - os.path.exists(os.path.join( - os.environ['DARWIN_OMADATA_PATH'], 'Phase4'))): - # try to read from Phase4 in parallel. - data = read_vps_from_tsv(self.h5.root.Genome, - genome.encode('utf-8')) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetVPsForGenome({})'.format(genome)) - - vp_tab = self.h5.create_table(rel_node_for_genome, 'VPairs', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, vp_tab) - if numpy.any(data['RelType'] >= tablefmt.PairwiseRelationTable.columns.get('RelType').enum['n/a']): - compute_ortholog_types(data, genome_offs) - self._write_to_table(vp_tab, data) - vp_tab.cols.EntryNr1.create_csindex() - - def add_same_species_relations(self): - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'within' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'cps', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetSameSpeciesRelations({})'.format(genome)) - - ss_tab = self.h5.create_table(rel_node_for_genome, 'within', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, ss_tab) - self._write_to_table(ss_tab, data) - ss_tab.cols.EntryNr1.create_csindex() - - def add_synteny_scores(self): - """add synteny scores of pairwise relations to database. - - Current implementation only computes synteny scores for - homoeologs, but easy to extend. Question is rather if we - need synteny scores for all genome pairs, and if not, how - to select. - - The computations of the scores are done using :mod:`synteny` - module of this package.""" - # TODO: compute for non-homoeologs relation as well. 
- self.logger.info("Adding synteny scores for polyploid genomes") - polyploid_genomes = self.h5.root.Genome.where('IsPolyploid==True') - for genome in polyploid_genomes: - genome_code = genome['UniProtSpeciesCode'].decode() - self.logger.info('compute synteny score for {}'.format(genome_code)) - synteny_scorer = SyntenyScorer(self.h5, genome_code) - rels = synteny_scorer.compute_scores() - self._callback_store_rel_data( - genome_code, rels, [('SyntenyConservationLocal', 'mean_synteny_score')]) - - def add_homoeology_confidence(self): - """adds the homoeology confidence scores to the database. - - This method should be called only after the synteny scores have - been computed and added to the database. - - The computations are done using :mod:`homoeologs` module.""" - self.logger.info("Adding homoeolog confidence scores") - polyploid_genomes = self.h5.root.Genome.where('IsPolyploid==True') - for genome in polyploid_genomes: - genome_code = genome['UniProtSpeciesCode'].decode() - self.logger.info("compute homoeolog confidence for {}".format(genome_code)) - homoeolg_scorer = HomeologsConfidenceCalculator(self.h5, genome_code) - rels = homoeolg_scorer.calculate_scores() - self._callback_store_rel_data( - genome_code, rels, [("Confidence", "fuzzy_confidence_scaled")]) - - def _callback_store_rel_data(self, genome, rels_df, assignments): - tab = self.h5.get_node('/PairwiseRelation/{}/within'.format(genome)) - df_all = pandas.DataFrame(tab.read()) - if 'entry_nr1' in list(rels_df): - enr_col_names = ['entry_nr1', 'entry_nr2'] - else: - enr_col_names = ['EntryNr1', 'EntryNr2'] - merged = pandas.merge(df_all, rels_df, how="left", left_on=['EntryNr1', 'EntryNr2'], - right_on=enr_col_names, validate='one_to_one') - - for target, source in assignments: - # replace NaN in column from rels_df by the default value of the target column - merged.loc[merged[source].isnull(), source] = tab.coldescrs[target].dflt - # update the data in the target hdf5 column by the source column data - tab.modify_column(column=merged[source].as_matrix(), colname=target) - tab.flush() - - def _add_sequence(self, sequence, row, sequence_array, off, typ="Seq"): - # add ' ' after each sequence (Ascii is smaller than - # any AA, allows to build PAT array with split between - # sequences. 
- seqLen = len(sequence) + 1 - row[typ + 'BufferOffset'] = off - row[typ + 'BufferLength'] = seqLen - seqNumpyObj = numpy.ndarray((seqLen,), - buffer=(sequence + " ").encode('utf-8'), - dtype=tables.StringAtom(1)) - sequence_array.append(seqNumpyObj) - if typ == "Seq": - row['MD5ProteinHash'] = hashlib.md5(sequence.encode('utf-8')).hexdigest() - return seqLen - - def add_proteins(self): - gsNode = self.h5.get_node('/Genome') - nrProt = sum(gsNode.cols.TotEntries) - nrAA = sum(gsNode.cols.TotAA) - protGrp = self._get_or_create_node('/Protein', "Root node for protein (oma entries) information") - protTab = self.h5.create_table(protGrp, 'Entries', tablefmt.ProteinTable, - expectedrows=nrProt) - seqArr = self.h5.create_earray(protGrp, 'SequenceBuffer', - tables.StringAtom(1), (0,), 'concatenated protein sequences', - expectedrows=nrAA + nrProt) - cdnaArr = self.h5.create_earray(protGrp, 'CDNABuffer', - tables.StringAtom(1), (0,), 'concatenated cDNA sequences', - expectedrows=3 * nrAA + nrProt) - seqOff = cdnaOff = 0 - loc_parser = locus_parser.LocusParser() - for gs in gsNode.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'prots', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - data = self.call_darwin_export('GetProteinsForGenome({})'.format(genome)) - - if len(data['seqs']) != gs['TotEntries']: - raise DataImportError('number of entries ({:d}) does ' - 'not match number of seqs ({:d}) for {}' - .format(len(data['seqs']), gs['TotEntries'], genome)) - - locTab = self.h5.create_table('/Protein/Locus', - genome, tablefmt.LocusTable, createparents=True, - expectedrows=gs['TotEntries'] * 4) - - for nr in range(gs['TotEntries']): - eNr = data['off'] + nr + 1 - protTab.row['EntryNr'] = eNr - protTab.row['OmaGroup'] = data['ogs'][nr] - - seqOff += self._add_sequence(data['seqs'][nr], protTab.row, seqArr, seqOff) - cdnaOff += self._add_sequence(data['cdna'][nr], protTab.row, cdnaArr, cdnaOff, 'CDNA') - - protTab.row['Chromosome'] = data['chrs'][nr] - protTab.row['AltSpliceVariant'] = data['alts'][nr] - protTab.row['OmaHOG'] = b" " # will be assigned later - protTab.row['CanonicalId'] = b" " # will be assigned later - - locus_str = data['locs'][nr] - try: - locus_tab = loc_parser.parse(locus_str, eNr) - locTab.append(locus_tab) - len_cds = sum(z['End'] - z['Start']+1 for z in locus_tab) - if len_cds != protTab.row['CDNABufferLength']-1: - self.logger.warning("sum of exon lengths differ with cdna sequence for {}: {} vs {}" - .format(eNr, len_cds, protTab.row['CDNABufferLength']-1)) - - protTab.row['LocusStart'] = locus_tab['Start'].min() - protTab.row['LocusEnd'] = locus_tab['End'].max() - protTab.row['LocusStrand'] = locus_tab[0]['Strand'] - except ValueError as e: - self.logger.warning(e) - protTab.row['SubGenome'] = data['subgenome'][nr].encode('ascii') - protTab.row.append() - protTab.flush() - seqArr.flush() - for n in (protTab, seqArr, locTab): - if n.size_in_memory != 0: - self.logger.info('worte %s: compression ratio %3f%%' % - (n._v_pathname, 100 * n.size_on_disk / n.size_in_memory)) - protTab.cols.EntryNr.create_csindex(filters=self._compr) - protTab.cols.MD5ProteinHash.create_csindex(filters=self._compr) - - def _write_to_table(self, tab, data): - if len(data)>0: - tab.append(data) - self.logger.info('wrote %s : compression ratio %.3f%%' % - (tab._v_pathname, 100 * tab.size_on_disk / tab.size_in_memory)) - - def 
add_hogs(self): - hog_path = os.path.normpath(os.path.join( - os.environ['DARWIN_NETWORK_SCRATCH_PATH'], - 'pyoma', 'split_hogs')) - entryTab = self.h5.get_node('/Protein/Entries') - tree_filename = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], - 'speciestree.nwk') - if not os.path.exists(hog_path): - hog_file = os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], - '..', 'downloads', 'oma-hogs.orthoXML.gz') - splitter = OrthoXMLSplitter(hog_file, cache_dir=hog_path) - splitter() - hog_converter = HogConverter(entryTab) - hog_converter.attach_newick_taxonomy(tree_filename) - hogTab = self.h5.create_table('/', 'HogLevel', tablefmt.HOGsTable, - 'nesting structure for each HOG', expectedrows=1e8) - self.orthoxml_buffer = self.h5.create_earray('/OrthoXML', 'Buffer', - tables.StringAtom(1), (0,), 'concatenated orthoxml files', - expectedrows=1e9, createparents=True) - self.orthoxml_index = self.h5.create_table('/OrthoXML', 'Index', tablefmt.OrthoXmlHogTable, - 'Range index per HOG into OrthoXML Buffer', expectedrows=5e6) - for root, dirs, filenames in os.walk(hog_path): - for fn in filenames: - try: - levels = hog_converter.convert_file(os.path.join(root, fn)) - hogTab.append(levels) - fam_nrs = set([z[0] for z in levels]) - self.add_orthoxml(os.path.join(root, fn), fam_nrs) - except Exception as e: - self.logger.error('an error occured while processing ' + fn + ':') - self.logger.exception(e) - - hog_converter.write_hogs() - - def add_orthoxml(self, orthoxml_path, fam_nrs): - """append orthoxml file content to orthoxml_buffer array and add index for the HOG family""" - if len(fam_nrs) > 1: - self.logger.warning('expected only one family per HOG file, but found {}: {}' - .format(len(fam_nrs), fam_nrs)) - self.logger.warning(' --> the orthoxml files per family will be not correct, ' - 'i.e. 
they will contain all families of this file.') - with open(orthoxml_path, 'r') as fh: - orthoxml = fh.read().encode('utf-8') - offset = len(self.orthoxml_buffer) - length = len(orthoxml) - self.orthoxml_buffer.append(numpy.ndarray((length,), - buffer=orthoxml, dtype=tables.StringAtom(1))) - for fam in fam_nrs: - row = self.orthoxml_index.row - row['Fam'] = fam - row['HogBufferOffset'] = offset - row['HogBufferLength'] = length - offset += length - row.append() - - def xref_databases(self): - return os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], 'ServerIndexed.db') - - def add_xrefs(self): - self.logger.info('start extracting XRefs, EC and GO annotations') - db_parser = DarwinDbEntryParser() - xref_tab = self.h5.create_table('/', 'XRef', tablefmt.XRefTable, - 'Cross-references of proteins to external ids / descriptions', - expectedrows=1e8) - - ec_tab = self.h5.create_table('/Annotations', 'EC', tablefmt.ECTable, 'Enzyme Commission annotations', - expectedrows=1e7, createparents=True) - gs = self.h5.get_node('/Genome').read() - with DescriptionManager(self.h5, '/Protein/Entries', '/Protein/DescriptionBuffer') as de_man, \ - GeneOntologyManager(self.h5, '/Annotations/GeneOntology', '/Ontologies/GO') as go_man: - xref_importer = XRefImporter(db_parser, gs, xref_tab, ec_tab, go_man, de_man) - files = self.xref_databases() - dbs_iter = fileinput.input(files=files) - db_parser.parse_entrytags(dbs_iter) - xref_importer.flush_buffers() - xref_importer.build_suffix_index() - - def add_group_metadata(self): - m = OmaGroupMetadataLoader(self.h5) - m.add_data() - - def close(self): - self.h5.root._f_setattr('conversion_end', time.strftime("%c")) - self.h5.close() - self.logger.info('closed {}'.format(self.h5.filename)) - - def create_indexes(self): - self.logger.info('creating indexes for HogLevel table') - hogTab = self.h5.get_node('/HogLevel') - for col in ('Fam', 'ID', 'Level'): - if not hogTab.colindexed[col]: - hogTab.colinstances[col].create_csindex() - orthoxmlTab = self.h5.get_node('/OrthoXML/Index') - orthoxmlTab.cols.Fam.create_csindex() - - self.logger.info('creating missing indexes for Entries table') - entryTab = self.h5.get_node('/Protein/Entries') - for col in ('EntryNr', 'OmaHOG', 'OmaGroup', 'MD5ProteinHash'): - if not entryTab.colindexed[col]: - entryTab.colinstances[col].create_csindex() - - self.logger.info('creating index for xrefs (EntryNr and XRefId)') - xrefTab = self.h5.get_node('/XRef') - xrefTab.cols.EntryNr.create_csindex() - xrefTab.cols.XRefId.create_csindex() - - self.logger.info('creating index for go (EntryNr and TermNr)') - goTab = self.h5.get_node('/Annotations/GeneOntology') - goTab.cols.EntryNr.create_csindex() - goTab.cols.TermNr.create_index() - - self.logger.info('creating index for EC (EntryNr)') - ec_tab = self.h5.get_node('/Annotations/EC') - ec_tab.cols.EntryNr.create_csindex() - - self.logger.info('creating index for domains (EntryNr)') - domtab = self.h5.get_node('/Annotations/Domains') - domtab.cols.EntryNr.create_csindex() - - self.logger.info('creating indexes for HOG to prevalent domains ' - '(Fam and DomainId)') - dom2hog_tab = self.h5.get_node('/HOGAnnotations/Domains') - dom2hog_tab.cols.DomainId.create_csindex() - domprev_tab = self.h5.get_node('/HOGAnnotations/DomainArchPrevalence') - domprev_tab.cols.Fam.create_csindex() - - def _iter_canonical_xref(self): - """extract one canonical xref id for each protein. 
- - We take the first valid xref per gene with the ordering of xrefsources - as given in the xrefsource_order.""" - xrefsource_order = ('UniProtKB/SwissProt', 'UniProtKB/TrEMBL', - 'Ensembl Gene', 'Ensembl Protein', 'FlyBase', - 'WormBase', 'EnsemblGenomes', 'RefSeq', 'SourceID') - - xrefs = self.h5.get_node('/XRef') - source_enum = xrefs.get_enum('XRefSource') - canonical_sources = [source_enum[z] for z in xrefsource_order] - current_protein = None - past_proteins = set([]) - for xref in xrefs: - if xref['EntryNr'] != current_protein: - if current_protein: - past_proteins.add(current_protein) - yield (current_protein, current_xref[1]) - current_protein = xref['EntryNr'] - current_xref = (1000, b'') # init with a sentinel - if current_protein in past_proteins: - raise DataImportError('Data in /XRef is not grouped w.r.t. EntryNr') - try: - rank = canonical_sources.index(xref['XRefSource']) - if rank < current_xref[0]: - current_xref = (rank, xref['XRefId']) - except ValueError: - pass - if current_protein: - yield (current_protein, current_xref[1]) - - def add_canonical_id(self): - """add one canonical xref id to the /Protein/Entries table.""" - self.logger.info('adding canonical ids for each protein...') - prot_tab = self.h5.get_node('/Protein/Entries') - canonical_ids = numpy.chararray(shape=(len(prot_tab),), itemsize=prot_tab.cols.CanonicalId.dtype.itemsize) - for eNr, canonical_id in self._iter_canonical_xref(): - row_nr = eNr - 1 - row = prot_tab[row_nr] - if row['EntryNr'] != eNr: - self.logger.warn('Entries table not properly sorted: {}, expected {}'.format(row['EntryNr'], eNr)) - raise DataImportError('Entries table not properly sorted') - canonical_ids[row_nr] = canonical_id - prot_tab.modify_column(0, len(prot_tab), 1, column=canonical_ids, colname='CanonicalId') - prot_tab.flush() - - def add_domain_info(self, domains): - self.logger.info('adding domain information...') - domtab = self.h5.create_table('/Annotations', 'Domains', tablefmt.DomainTable, createparents=True, - expectedrows=1e7) - entrytab = self.h5.get_node('/Protein/Entries') - md5_to_enr = collections.defaultdict(list) - for e in entrytab: - md5_to_enr[e['MD5ProteinHash']].append(e['EntryNr']) - - buffer = [] - for i, domain in enumerate(domains): - for entry_nr in md5_to_enr[domain.md5.encode('utf-8')]: - buffer.append((entry_nr, domain.id, domain.coords)) - if len(buffer) > 5000: - domtab.append(buffer) - buffer = [] - if i % 50000 == 0: - self.logger.info('processed {:d} domain annotations so far'.format(i)) - if len(buffer) > 0: - domtab.append(buffer) - domtab.flush() - - def add_domainname_info(self, domainname_infos): - self.logger.info('adding domain name information...') - dom_name_tab = self.h5.create_table('/Annotations', 'DomainDescription', tablefmt.DomainDescriptionTable, - createparents=True, expectedrows=2e5) - buffer = [] - for i, dom_info in enumerate(domainname_infos): - buffer.append(dom_info) - if len(buffer) > 5000: - self._write_to_table(dom_name_tab, buffer) - buffer = [] - if i % 50000 == 0: - self.logger.info('processed {:d} domain name descriptions so far'.format(i)) - if len(buffer) > 0: - self._write_to_table(dom_name_tab, buffer) - dom_name_tab.flush() - - def update_summary_stats(self): - """update the summary statistics of xrefs & go. - - The function analyses the well-known xref sources as well as - GO annotations and computes aggregated counts for - all / in OMA Group / in HOGs for all of them. 
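# Illustrative sketch (editor's addition, not part of the deleted module): the
# generator above yields one canonical cross-reference per protein, preferring
# whichever source comes first in xrefsource_order. The same ranking idea over
# plain tuples; the source list and sample rows here are shortened/invented.
import itertools

SOURCE_ORDER = ("UniProtKB/SwissProt", "UniProtKB/TrEMBL", "RefSeq", "SourceID")
RANK = {source: i for i, source in enumerate(SOURCE_ORDER)}

def canonical_ids(xref_rows):
    """xref_rows: (entry_nr, source, xref_id) tuples, grouped by entry_nr."""
    for entry_nr, group in itertools.groupby(xref_rows, key=lambda row: row[0]):
        best = min(group, key=lambda row: RANK.get(row[1], len(RANK)))
        yield entry_nr, best[2]

rows = [(1, "RefSeq", "NP_000001"), (1, "UniProtKB/SwissProt", "P12345"),
        (2, "SourceID", "gene2.t1")]
assert dict(canonical_ids(rows)) == {1: "P12345", 2: "gene2.t1"}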
- """ - for tab_name, sum_fun in [('/Annotations/GeneOntology', self.count_xref_summary), - ('/XRef', self.count_xref_summary)]: - summary = sum_fun() - tab = self.h5.get_node(tab_name) - for attr, val in summary.items(): - tab.set_attr(attr, val) - - group_sizes = self.collect_group_sizes() - summary = self._get_or_create_node('/Summary', 'Various Summary Statistics') - for group_type in group_sizes.keys(): - grp_size_tab = self.create_table_if_needed( - summary, '{}_size_hist'.format(group_type), - description=tablefmt.GroupsizeHistogram, - drop_data=True) - data = sorted(group_sizes[group_type].items()) - grp_size_tab.append(data) - - cov_fracs = self.add_domain_covered_sites_counts() - cov_hist, bins = numpy.histogram(cov_fracs[cov_fracs > 0], bins=numpy.linspace(0, 1, 51)) - cov_hist_data = numpy.zeros(50, dtype=[('BinEndValue', 'f4'), ('Counts', 'i4')]) - cov_hist_data['BinEndValue'] = bins[1:] - cov_hist_data['Counts'] = cov_hist - dom_cov_hist_tab = self.create_table_if_needed(summary, 'Domain_coverage_hist', - drop_data=True, obj=cov_hist_data) - dom_cov_hist_tab.set_attr('frac_genes_w_domain', len(cov_fracs[cov_fracs > 0]) / len(cov_fracs)) - dom_cov_hist_tab.set_attr('mean_coverage_overall', numpy.mean(cov_fracs)) - dom_cov_hist_tab.set_attr('mean_coverage_w_domain', numpy.mean(cov_fracs[cov_fracs > 0])) - - def count_gene_ontology_summary(self): - self.logger.info('Bulding gene ontology annotations summary info') - go_tab = self.h5.get_node('/Annotations/GeneOntology') - prot_tab = self.h5.get_node('/Protein/Entries') - exp_codes = frozenset([b'EXP', b'IDA', b'IPI', b'IMP', b'IGI' b'IEP']) - cnts = collections.Counter() - cur_enr = None - for (enr, term), row_iter in itertools.groupby(go_tab, operator.itemgetter('EntryNr','TermNr')): - evidences = {row['Evidence'] for row in row_iter} - is_iea = b'IEA' in evidences - evidences.discard(b'IEA') - is_exp = not exp_codes.isdisjoint(evidences) - is_cur = len(evidences.difference(exp_codes)) > 0 - cnts['annotations_any'] += 1 - if is_exp: - cnts['annotations_exp'] += 1 - if is_cur: - cnts['annotations_currated'] += 1 - if is_iea: - cnts['annotations_iea'] += 1 - if cur_enr != enr: - e = next(prot_tab.where('EntryNr == {}'.format(enr))).fetch_all_fields() - cnts['proteins_any'] += 1 - if e['OmaGroup'] != 0: - cnts['protein_OmaGroup'] += 1 - if len(e['OmaHOG']) > 0: - cnts['protein_HOG'] += 1 - cur_enr = enr - return cnts - - def count_xref_summary(self): - self.logger.info('Building cross-ref summary info') - xref_tab = self.h5.get_node('/XRef') - prot_tab_iter = iter(self.h5.get_node('/Protein/Entries')) - source = xref_tab.get_enum('XRefSource') - trusted = frozenset(['UniProtKB/SwissProt', 'UniProtKB/TrEMBL', 'RefSeq', 'EntrezGene', 'Ensembl Gene', 'Ensembl Protein']) - if len(trusted.difference(source._names.keys())) > 0: - raise ValueError('set of trusted xrefs is invalid') - cnts = collections.Counter() - - entry = next(prot_tab_iter) - for enr, xref_it in itertools.groupby(xref_tab, operator.itemgetter('EntryNr')): - while entry['EntryNr'] < enr: - entry = next(prot_tab_iter) - sources_all = [source._values[x['XRefSource']] for x in xref_it] - cnts += collections.Counter(sources_all) - has_trusted_xref = len(trusted.intersection(sources_all)) > 0 - if has_trusted_xref: - cnts['trusted_all'] += 1 - if entry['OmaGroup'] != 0: - cnts['trusted_OmaGroup'] += 1 - if len(entry['OmaHOG']) > 0: - cnts['trusted_HOG'] += 1 - return cnts - - def collect_group_sizes(self): - self.logger.info("Building grouping size histograms") - groupings 
= ('OmaHOG', 'OmaGroup') - memb_cnts = {grp: collections.defaultdict(int) for grp in groupings} - fam_re = re.compile(br'([A-Z]+:)?(?P[0-9]+).*') - prot_tab = self.h5.get_node('/Protein/Entries') - for row in prot_tab: - for grp in groupings: - if grp == 'OmaHOG': - m = fam_re.match(row[grp]) - if m is None: - continue - grp_id = int(m.group('fam')) - else: - grp_id = int(row[grp]) - if grp_id == 0: - continue - memb_cnts[grp][grp_id] += 1 - sizes = {grp: collections.defaultdict(int) for grp in groupings} - for grp in groupings: - for grp_size in memb_cnts[grp].values(): - sizes[grp][grp_size] += 1 - return sizes - - def compute_domaincovered_sites(self): - dom_tab = self.h5.get_node('/Annotations/Domains') - domains = pandas.DataFrame.from_records(dom_tab[:]) - - def dlen(coords): - doms = [int(pos) for pos in coords.split(b':')] - return sum((doms[i + 1] - doms[i] + 1 for i in range(0, len(doms), 2))) - - # sum all parts of each domain region and store total length in DLen column - domains = domains.assign(DLen=domains['Coords'].apply(dlen)) - # sum over all domains per protein - cov_sites = domains.groupby('EntryNr').agg({'DLen': sum}) - return cov_sites - - def add_domain_covered_sites_counts(self): - """Stores the number of AA covered by a DomainAnnotation. - - This method adds to the hdf5 file a /Protein/DomainCoverage array that - contains the number of AA sites covered by a domain. The position - corresponds to the protein entry numbers in /Protein/Entries. - - :Note: The method assumes that the domains are all non-overlapping. - If they are not, the reported coverage will be too high! - - :return: covered fractions by domains for each protein - :rtype: numpy.array""" - self.logger.info("Counting covered sites by domains") - cov_sites_df = self.compute_domaincovered_sites() - - prot_tab = self.h5.get_node('/Protein/Entries') - enr_col = prot_tab.col('EntryNr') - assert numpy.all(numpy.equal(enr_col, numpy.arange(1, len(prot_tab)+1))) - - cov_sites = numpy.zeros(len(prot_tab), dtype=numpy.uint32) - for eNr, coverage in zip(cov_sites_df.index, cov_sites_df.DLen.values): - cov_sites[eNr-1] = coverage - create_node = False - try: - dom_cov_tab = self.h5.get_node('/Protein/CoveredSitesByDomains') - if len(dom_cov_tab) != len(cov_sites): - self.h5.remove_node('/Protein/CoveredSitesByDomains') - create_node = True - except tables.NoSuchNodeError: - create_node = True - if create_node: - dom_cov_tab = self.h5.create_carray('/Protein', 'CoveredSitesByDomains', - tables.UInt32Atom(), (len(cov_sites),)) - dom_cov_tab[0:len(cov_sites)] = cov_sites - return cov_sites / (prot_tab.col('SeqBufferLength') - 1) - - def add_sequence_suffix_array(self, k=6, fn=None, sa=None): - ''' - Adds the sequence suffix array to the database. NOTE: this - (obviously) requires A LOT of memory for large DBs. - ''' - # Ensure we're run in correct order... - assert ('Protein' in self.h5.root), 'Add proteins before calc. SA!' - idx_compr = tables.Filters(complevel=6, complib='blosc', fletcher32=True) - - # Add to separate file if fn is set. - if fn is None: - db = self.h5 - else: - fn = os.path.normpath(os.path.join( - os.getenv('DARWIN_BROWSERDATA_PATH', ''), - fn)) - db = tables.open_file(fn, 'w', filters=idx_compr) - db.create_group('/', 'Protein') - db.root._f_setattr('conversion_start', time.strftime("%c")) - self.logger.info('opened {}'.format(db.filename)) - - # Load sequence buffer to memory - this is required to calculate the SA. 
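# Illustrative sketch (editor's addition, not part of the deleted module): the
# dlen helper further up sums the lengths of all start:end pairs packed into a
# Coords string such as b"5:120:130:200", and the per-protein coverage fraction
# is that sum divided by the sequence length. A plain-Python version; the
# sequence length below is an example value, and overlapping regions would be
# double-counted, as the docstring above already warns.
def covered_length(coords: bytes) -> int:
    positions = [int(p) for p in coords.split(b":")]
    return sum(positions[i + 1] - positions[i] + 1 for i in range(0, len(positions), 2))

seq_len = 250
covered = covered_length(b"5:120:130:200")        # two regions: 116 + 71 sites
assert covered == 187
coverage_fraction = covered / seq_len             # -> 0.748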
- # Do it here (instead of in PySAIS) so that we can use it for computing - # the split points later. - seqs = self.h5.get_node('/Protein/SequenceBuffer')[:].tobytes() - n = len(self.h5.get_node('/Protein/Entries')) - - # Compute & save the suffix array to DB. TODO: work out what compression - # works best! - if sa is None: - sa = sais(seqs) - sa[:n].sort() # Sort delimiters by position. - db.create_carray('/Protein', - name='SequenceIndex', - title='concatenated protein sequences suffix array', - obj=sa, - filters=idx_compr) - - # Create lookup table for fa2go - dtype = (numpy.uint32 if (n < numpy.iinfo(numpy.uint32).max) else - numpy.uint64) - idx = numpy.zeros(sa.shape, dtype=dtype) - mask = numpy.zeros(sa.shape, dtype=numpy.bool) - - # Compute mask and entry index for sequence buff - for i in range(n): - s = (sa[i - 1] if i > 0 else -1) + 1 - e = (sa[i] + 1) - idx[s:e] = i + 1 - mask[(e - k):e] = True # (k-1) invalid and delim. - - # Mask off those we don't want... - sa = sa[~mask[sa]] - - # Reorder the necessary elements of entry index - idx = idx[sa] - - # Initialise lookup array - atom = (tables.UInt32Atom if dtype is numpy.uint32 else tables.UInt64Atom) - kmers = KmerEncoder(k, is_protein=True) - kmer_lookup_arr = db.create_vlarray('/Protein', - name='KmerLookup', - atom=atom(shape=()), - title='kmer entry lookup table', - filters=idx_compr, - expectedrows=len(kmers)) - kmer_lookup_arr._f_setattr('k', k) - - # Now find the split points and construct lookup ragged array. - ii = 0 - for kk in tqdm(range(len(kmers)), desc='Constructing kmer lookup'): - kmer = kmers.encode(kk) - if (ii < len(sa)) and (seqs[sa[ii]:(sa[ii] + k)] == kmer): - jj = ii + 1 - while (jj < len(sa)) and (seqs[sa[jj]:(sa[jj] + k)] == kmer): - jj += 1 - kmer_lookup_arr.append(idx[ii:jj]) - # New start - ii = jj - else: - # End or not found - kmer_lookup_arr.append([]) - - if db.filename != self.h5.filename: - self.logger.info('storing external links to SequenceIndex and KmerLookup') - self.h5.create_external_link('/Protein', 'KmerLookup', - self._relative_path_to_external_node(kmer_lookup_arr)) - self.h5.create_external_link('/Protein', 'SequenceIndex', - self._relative_path_to_external_node(db.root.Protein.SequenceIndex)) - db.root._f_setattr('conversion_end', time.strftime("%c")) - db.close() - self.logger.info('closed {}'.format(db.filename)) - - def _relative_path_to_external_node(self, node): - rel_path = os.path.relpath(node._v_file.filename, os.path.dirname(self.h5.filename)) - return str(rel_path + ":" + node._v_pathname) - - def add_hog_domain_prevalence(self): - # Check that protein entries / domains are added already to the DB - assert True # TODO - - # Used later - hl_tab = self.h5.get_node('/HogLevel') - if not hl_tab.colindexed['Fam']: - hl_tab.colinstances['Fam'].create_csindex() - - # Load the HOG -> Entry table to memory - prot_tab = self.h5.root.Protein.Entries - # TODO: work out how to do this in a neater way - df = pandas.DataFrame.from_records(((z['EntryNr'], z['OmaHOG'], z['SeqBufferLength']) - for z in prot_tab.iterrows()), - columns=['EntryNr', 'OmaHOG', 'SeqBufferLength']) - # Strip singletons - df = df[~(df['OmaHOG'] == b'')] - - # Reformat HOG ID to plain-integer for top-level grouping only - df['OmaHOG'] = df['OmaHOG'].apply(lambda i: int(i[4:].split(b'.')[0])) - - # Load domains - domains = pandas.DataFrame.from_records(self.h5.root.Annotations.Domains[:]) - - # Ensure sorted by coordinate - TODO: move this to DA import function - domains['start'] = domains['Coords'].apply(lambda c: - 
int(c.split(b':')[0])) - domains.sort_values(['EntryNr', 'start'], inplace=True) - domains = domains[['EntryNr', 'DomainId']] - - # Merge domains / entry-hog tables. Keep entries with no domains - # so that we can count the size of the HOGs. - df = pandas.merge(df, domains, on='EntryNr', how='left') - - # Gather entry-domain for each HOG. - hog2dom = [] - hog2info = [] - for (hog_id, hdf) in tqdm(df.groupby('OmaHOG')): - size = len(set(hdf['EntryNr'])) - - hdf = hdf[~hdf['DomainId'].isnull()] - cov = len(set(hdf['EntryNr'])) # Coverage with any DA - - if (size > 2) and (cov > 1): - # There are some annotations - da = collections.defaultdict(list) - for (enum, edf) in hdf.groupby('EntryNr'): - d = edf['DomainId'] - d = tuple(d) if (type(d) != bytes) else (d,) - da[d].append(enum) - - da = sorted(da.items(), key=lambda i: len(i[1]), reverse=True) - c = len(da[0][1]) # Count of prev. DA - if c > 1: - # DA exists in more than one member. - cons_da = da[0][0] - repr_entry = da[0][1][0] - tl = hl_tab.read_where('Fam == {}'.format(hog_id))[0]['Level'].decode('ascii') - rep_len = hdf[hdf['EntryNr'] == repr_entry]['SeqBufferLength'] - rep_len = int(rep_len if len(rep_len) == 1 else list(rep_len)[0]) - - # Save the consensus DA - off = len(hog2info) # Offset in the information table. - hog2dom += [(off, d) for d in cons_da] - - # Save required information about this group for the web - # view. - hog2info.append((hog_id, # HOG ID - repr_entry, # Repr. entry - rep_len, # Repr. entry length - tl, # Top level of HOG - size, # HOG size - c)) # Prevalence - - # Create tables in file -- done this way as these end up being pretty - # small tables (<25MB) - tab = self.h5.create_table('/HOGAnnotations', - 'DomainArchPrevalence', - tablefmt.HOGDomainArchPrevalenceTable, - createparents=True, - expectedrows=len(hog2info)) - self._write_to_table(tab, hog2info) - tab.flush() # Required? - - # HOG <-> Domain table - tab = self.h5.create_table('/HOGAnnotations', - 'Domains', - tablefmt.HOGDomainPresenceTable, - createparents=True, - expectedrows=len(hog2dom)) - self._write_to_table(tab, hog2dom) - tab.flush() # Required? - - -def download_url_if_not_present(url, force_copy=False): - if url.startswith('file://') and not force_copy: - fname = url[len('file://'):] - if os.path.exists(fname): - common.package_logger.info('using file "{}" directly from source without copying.'.format(url)) - return fname - tmpfolder = os.path.join(os.getenv('DARWIN_NETWORK_SCRATCH_PATH', '/tmp'), "Browser", "xref") - basename = url.split('/')[-1] - fname = os.path.join(tmpfolder, basename) - if not os.path.exists(tmpfolder): - os.makedirs(tmpfolder) - if not os.path.exists(fname): - common.package_logger.info("downloading {} into {}".format(url, fname)) - try: - urllib.request.urlretrieve(url, fname) - except urllib.request.URLError: - common.package_logger.warn('cannot download {}'.format(url)) - return fname - - -def iter_domains(url): - DomainTuple = collections.namedtuple('DomainTuple', ('md5', 'id', 'coords')) - - fname = download_url_if_not_present(url) - with gzip.open(fname, 'rt') as uncompressed: - dialect = csv.Sniffer().sniff(uncompressed.read(4096)) - uncompressed.seek(0) - csv_reader = csv.reader(uncompressed, dialect) - col_md5, col_id, col_coord = (None,) * 3 - coord_fromat_trans = str.maketrans('-,', '::') - - for lineNr, row in enumerate(csv_reader): - if col_md5 is None: - # identify which tuples to use. - if len(row) >= 9: - # representative_proteins format. 
use columns 5-7 - col_md5, col_id, col_coord = 4, 5, 6 - elif len(row) == 3: - # additionally created ones, minimal format - col_md5, col_id, col_coord = 0, 1, 2 - else: - raise DataImportError("Unknown Domain Annotation format in {}".format(uncompressed.filename)) - try: - dom = DomainTuple(row[col_md5], row[col_id], row[col_coord].translate(coord_fromat_trans)) - if lineNr < 10: - # do some sanity checks on the first few lines - if re.match(r'[0-9a-f]{32}$', dom.md5) is None: - raise DataImportError("md5 hash of line {:d} has unexpected values: {}" - .format(lineNr, dom.md5)) - if re.match(r'([1-4]\.\d+\.\d+\.\d+|PF\d+)$', dom.id) is None: - raise DataImportError("Domain-ID of line {:d} has unexpected value: {}" - .format(lineNr, dom.id)) - if re.match(r'\d+:\d+', dom.coords) is None: - raise DataImportError("Domain coordinates in line {:d} has unexpected value: {}" - .format(lineNr, dom.coords)) - yield dom - except Exception: - common.package_logger.exception('cannot create tuple from line {}'.format(lineNr)) - - -def only_pfam_or_cath_domains(iterable): - cath_re = re.compile(r'[1-4]\.') - for dom in iterable: - if dom.id.startswith('PF') or cath_re.match(dom.id) is not None: - yield dom - - -def filter_duplicated_domains(iterable): - """filter duplicated domain annotations that come from different proteins - with the exact same sequence.""" - seen = set([]) - ignored = 0 - for dom in iterable: - if not dom in seen: - seen.add(dom) - yield dom - else: - ignored += 1 - common.package_logger.info("skipped {} duplicated domains. {} distinct domains yielded" - .format(ignored, len(seen))) - - -class OmaGroupMetadataLoader(object): - """OMA Group Meta data extractor. - - This class provides the means to import the Keywords and Fingerprints - of the OMA Groups into the hdf5 database. The data is stored under - in the node defined by :attr:`meta_data_path`, which defaults to - /OmaGroups/MetaData. 
- """ - keyword_name = "Keywords.drw" - finger_name = "Fingerprints" - - meta_data_path = '/OmaGroups/MetaData' - - def __init__(self, db): - self.db = db - - def add_data(self): - common.package_logger.info('adding OmaGroup Metadata') - nr_groups = self._get_nr_of_groups() - has_meta_data = self._check_textfiles_avail() - if has_meta_data: - data = self._load_data() - fingerprints = data['Fingerprints'] - keywords = data['Keywords'] - else: - common.package_logger.warning('No fingerprint nor keyword information available') - fingerprints = [b'n/a'] * nr_groups - keywords = [b''] * nr_groups - if nr_groups != len(fingerprints) or nr_groups != len(keywords): - raise DataImportError('nr of oma groups does not match the number of fingerprints and keywords') - - grptab, keybuf = self._create_db_objects(nr_groups) - self._fill_data_into_db(fingerprints, keywords, grptab, keybuf) - grptab.modify_column(column=self._get_group_member_counts(), colname='NrMembers') - self._create_indexes(grptab) - - def _create_db_objects(self, nrows): - key_path = os.path.join(os.path.dirname(self.meta_data_path), 'KeywordBuffer') - try: - self.db.get_node(self.meta_data_path) - self.db.remove_node(self.meta_data_path) - self.db.remove_node(key_path) - except tables.NoSuchNodeError: - pass - root, name = self.meta_data_path.rsplit('/', 1) - grptab = self.db.create_table(root, name, tablefmt.OmaGroupTable, - expectedrows=nrows, createparents=True) - buffer = self.db.create_earray(root, "KeywordBuffer", tables.StringAtom(1), (0,), - 'concatenated group keywords descriptions', - expectedrows=500 * nrows) - return grptab, buffer - - def _fill_data_into_db(self, stable_ids, keywords, grp_tab, key_buf): - row = grp_tab.row - buf_pos = 0 - for i in range(len(stable_ids)): - row['GroupNr'] = i+1 - row['Fingerprint'] = stable_ids[i] - row['KeywordOffset'] = buf_pos - row['KeywordLength'] = len(keywords[i]) - row.append() - key = numpy.ndarray((len(keywords[i]),), buffer=keywords[i], - dtype=tables.StringAtom(1)) - key_buf.append(key) - buf_pos += len(keywords[i]) - grp_tab.flush() - key_buf.flush() - - def _create_indexes(self, grp_tab): - grp_tab.cols.Fingerprint.create_csindex() - grp_tab.cols.GroupNr.create_csindex() - - def _parse_darwin_string_list_file(self, fh): - data = fh.read() - start, end = data.find(b'['), data.rfind(b', NULL]') - if end == -1: - end = data.rfind(b']:') - part = data[start:end] + b']' - as_json = part.replace(b"''", b"__apos__").replace(b"'", b'"')\ - .replace(b'__apos__', b"'") - as_list = json.loads(as_json.decode()) - return [el.encode('utf8') for el in as_list] - - def _load_data(self): - return callDarwinExport('GetGroupData()') - - def _get_nr_of_groups(self): - etab = self.db.get_node('/Protein/Entries') - try: - return etab[etab.colindexes['OmaGroup'][-1]]['OmaGroup'] - except KeyError: - return max(etab.col('OmaGroup')) - - def _get_group_member_counts(self): - grp_nr, cnts = numpy.unique(self.db.get_node('/Protein/Entries').col('OmaGroup'), return_counts=True) - if grp_nr[0] == 0: - cnts = cnts[1:] - assert(len(cnts) == self._get_nr_of_groups()) - return cnts - - def _check_textfiles_avail(self): - rootdir = os.getenv('DARWIN_BROWSERDATA_PATH','') - fn1 = os.path.join(rootdir, self.keyword_name) - fn2 = os.path.join(rootdir, self.finger_name) - return os.path.exists(fn1) and os.path.exists(fn2) - - -class DescriptionManager(object): - def __init__(self, db, entry_path, buffer_path): - self.db = db - self.entry_path = entry_path - self.buffer_path = buffer_path - - def 
__enter__(self): - self.entry_tab = self.db.get_node(self.entry_path) - if not numpy.all(numpy.equal(self.entry_tab.col('EntryNr'), - numpy.arange(1, len(self.entry_tab) + 1))): - raise RuntimeError('entry table is not sorted') - - root, name = os.path.split(self.buffer_path) - self.desc_buf = self.db.create_earray(root, name, - tables.StringAtom(1), (0,), 'concatenated protein descriptions', - expectedrows=len(self.entry_tab) * 100) - self.cur_eNr = None - self.cur_desc = [] - bufindex_dtype = numpy.dtype([(col, self.entry_tab.coldtypes[col]) - for col in ('DescriptionOffset', 'DescriptionLength')]) - # columns to be stored in entry table with buffer index data - self.buf_index = numpy.zeros(len(self.entry_tab), dtype=bufindex_dtype) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.cur_eNr: - self._store_description() - self.desc_buf.flush() - self.entry_tab.modify_columns(columns=self.buf_index, - names=self.buf_index.dtype.names) - self.entry_tab.flush() - - def add_description(self, eNr, desc): - """stages a description for addition. Note that the descriptions - must be ordered according to the entryNr, i.e. all descriptions - related to eNr X must be staged before changeing to another eNr.""" - if self.cur_eNr and self.cur_eNr != eNr: - self._store_description() - self.cur_desc = [] - self.cur_eNr = eNr - self.cur_desc.append(desc) - - def _store_description(self): - buf = "; ".join(self.cur_desc).encode('utf-8') - buf = buf[0:2 ** 16 - 1] # limit to max value of buffer length field - len_buf = len(buf) - idx = self.cur_eNr - 1 - self.buf_index[idx]['DescriptionOffset'] = len(self.desc_buf) - self.buf_index[idx]['DescriptionLength'] = len_buf - self.desc_buf.append(numpy.ndarray((len_buf,), buffer=buf, dtype=tables.StringAtom(1))) - - -class GeneOntologyManager(object): - ontology_url = "http://purl.obolibrary.org/obo/go/go-basic.obo" - - def __init__(self, db, annotation_path, ontology_path): - self.db = db - self.annotation_path = annotation_path - self.ontology_path = ontology_path - self._go_buf = [] - self.quote_re = re.compile(r'([[,])([\w_:]+)([,\]])') - - def __enter__(self): - go_obo_file = download_url_if_not_present(self.ontology_url) - # check that ontology file is not broken. 
if we can build it, it should be ok - self.go = GeneOntology(OntologyParser(go_obo_file)) - self.go.parse() - - with open(go_obo_file, 'rb') as fh: - go_obo = fh.read() - root, name = os.path.split(self.ontology_path) - obo = self.db.create_carray(root, name, title='Gene ontology hierarchy definition', createparents=True, - obj=numpy.ndarray(len(go_obo), buffer=go_obo, dtype=tables.StringAtom(1))) - obo._f_setattr('ontology_release', self._get_obo_version(obo)) - - root, name = os.path.split(self.annotation_path) - self.go_tab = self.db.create_table(root, name, tablefmt.GeneOntologyTable, - 'Gene Ontology annotations', expectedrows=1e8, createparents=True) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self._flush_buffers() - self.go_tab.flush() - - def _get_obo_version(self, obo_arr): - header = obo_arr[0:1000].tobytes() - rel_info = re.search(b'data-version:\s*(?P[\w/_ -]+)', header) - if rel_info is not None: - rel_info = rel_info.group('version').decode() - return rel_info - - def _flush_buffers(self): - common.package_logger.info('flushing go annotations buffers') - if len(self._go_buf) > 0: - self.go_tab.append(self._go_buf) - self._go_buf = [] - - def add_annotations(self, enr, gos): - """parse go annotations and add them to the go buffer""" - if not (isinstance(enr, int) and isinstance(gos, str)): - raise ValueError('input data invalid') - for t in gos.split('; '): - t = t.strip() - try: - term, rem = t.split('@') - except ValueError as e: - common.package_logger.warning('cannot parse GO annotation: ' + t) - continue - - try: - term_nr = self.go.term_by_id(term).id - except ValueError: - common.package_logger.warning('invalid GO term for entry {:d}: {:s} (likely obsolete)' - .format(enr, term)) - continue - rem = rem.replace('{', '[') - rem = rem.replace('}', ']') - rem = self.quote_re.sub('\g<1>"\g<2>"\g<3>', rem) - for evi, refs in eval(rem): - for ref in refs: - self._go_buf.append((enr, term_nr, evi, ref.encode('utf-8'))) - if len(self._go_buf) > 2e6: - self._flush_buffers() - - -class GroupAnnotatorInclGeneRefs(familyanalyzer.GroupAnnotator): - def _annotateGroupR(self, node, og, idx=0): - if familyanalyzer.OrthoXMLQuery.is_geneRef_node(node): - node.set('og', og) - else: - super()._annotateGroupR(node, og, idx) - - -class HogConverter(object): - def __init__(self, entry_tab): - self.fam_re = re.compile(r'HOG:(?P\d+)') - self.hogs = numpy.zeros(shape=(len(entry_tab) + 1,), dtype=entry_tab.cols.OmaHOG.dtype) - self.entry_tab = entry_tab - - def attach_newick_taxonomy(self, tree): - self.taxonomy = familyanalyzer.NewickTaxonomy(tree) - - def _assert_hogid_has_correct_prefix(self, fa_parser): - for grp in fa_parser.getToplevelGroups(): - if not grp.get('id').startswith('HOG:'): - grp.set('id', 'HOG:{:07d}'.format(int(grp.get('id')))) - - def convert_file(self, fn): - p = familyanalyzer.OrthoXMLParser(fn) - self._assert_hogid_has_correct_prefix(p) - if hasattr(self, 'taxonomy'): - p.augmentTaxonomyInfo(self.taxonomy) - else: - p.augmentTaxonomyInfo(familyanalyzer.TaxonomyFactory.newTaxonomy(p)) - GroupAnnotatorInclGeneRefs(p).annotateDoc() - - levs = [] - for fam in p.getToplevelGroups(): - m = self.fam_re.match(fam.get('og')) - fam_nr = int(m.group('fam_nr')) - levs.extend([(fam_nr, n.getparent().get('og'), n.get('value'),) + self.get_hog_scores(n.getparent()) - for n in p._findSubNodes('property', root=fam) - if n.get('name') == "TaxRange"]) - - geneNodes = p.root.findall('.//{{{ns0}}}geneRef'. 
- format(**familyanalyzer.OrthoXMLParser.ns)) - for x in geneNodes: - self.hogs[int(x.get('id'))] = x.get('og') - - return levs - - def write_hogs(self): - """update the Entry Table with the newly collected OmaHOG values for all - the proteins at once. - - .. note: This method will overwrite any previous value of the OmaHOG column""" - self.entry_tab.modify_column(0, len(self.entry_tab), 1, self.hogs[1:], 'OmaHOG') - self.entry_tab.flush() - - def get_hog_scores(self, og_node): - """extract the scores associated with an orthologGroup node - - only scores that are defined in HOGsTable are extract. The method - returns a tuple with the scores in the order of the score fields.""" - scores = collections.OrderedDict([(score, tablefmt.HOGsTable.columns[score].dflt) - for score in ('CompletenessScore', 'ImpliedLosses')]) - for score in og_node.iterfind('{*}score'): - score_id = score.get("id") - if score_id == "CompletenessScore": - scores['CompletenessScore'] = float(score.get('value')) - elif score_id == "ImpliedLosses": - scores['ImpliedLosses'] = int(score.get('value')) - return tuple(scores.values()) - - -class XRefImporter(object): - """Object to import various types of crossreferences into hdf5. - - The XRefImporter registers at a db_parser object various handlers - to import the various types of xrefs, namely ids, go-terms, - EC annotations and descriptions.""" - def __init__(self, db_parser, genomes_tab, xref_tab, ec_tab, go_manager, desc_manager): - self.xrefs = [] - self.ec = [] - self.xref_tab = xref_tab - self.ec_tab = ec_tab - self.go_manager = go_manager - self.desc_manager = desc_manager - - self.verif_enum = tablefmt.XRefTable.columns.get('Verification').enum - xrefEnum = tablefmt.XRefTable.columns.get('XRefSource').enum - tag_to_enums = { - 'GI': (xrefEnum['GI'], 'exact'), - 'EntrezGene': (xrefEnum['EntrezGene'], 'exact'), - 'WikiGene': (xrefEnum['WikiGene'], 'unchecked'), - 'IPI': (xrefEnum['IPI'], 'unchecked'), - 'Refseq_ID': (xrefEnum['RefSeq'], 'exact'), - 'SwissProt': (xrefEnum['UniProtKB/SwissProt'], 'exact'), - 'GeneName': (xrefEnum['Gene Name'], 'unchecked'), - 'ORFNames': (xrefEnum['ORF Name'], 'unchecked'), - 'OrderedLocusNames': (xrefEnum['Ordered Locus Name'], 'unchecked'), - 'ProtName': (xrefEnum['Protein Name'], 'unchecked'), - 'Synonyms': (xrefEnum['Synonym'], 'unchecked'), - 'HGNC_Id': (xrefEnum['HGNC'], 'unchecked'), - 'PMP': (xrefEnum['PMP'], 'exact'), - 'PDB': (xrefEnum['PDB'], 'unchecked'), - 'EMBL': (xrefEnum['EMBL'], 'unchecked'), - 'ID': (xrefEnum['SourceID'], 'exact'), - 'AC': (xrefEnum['SourceAC'], 'exact'), - } - for tag, enumval in tag_to_enums.items(): - db_parser.add_tag_handler( - tag, - lambda key, enr, typ=enumval: self.multi_key_handler(key, enr, typ[0], typ[1])) - db_parser.add_tag_handler('DE', - lambda key, enr: self.description_handler(key, enr)) - db_parser.add_tag_handler('GO', self.go_handler) - db_parser.add_tag_handler('ID', self.assign_source_handler) - db_parser.add_tag_handler('AC', self.assign_source_handler) - db_parser.add_tag_handler('EC', self.ec_handler) - - for tag in ['SwissProt_AC', 'UniProt']: # UniProt/TrEMBL tag is cut to UniProt! 
- db_parser.add_tag_handler(tag, - lambda key, enr, typ=xrefEnum['UniProtKB/TrEMBL']: - self.remove_uniprot_code_handler(key, enr, typ)) - - # register the potential_flush as end_of_entry_notifier - db_parser.add_end_of_entry_notifier(self.potential_flush) - - self.db_parser = db_parser - self.xrefEnum = xrefEnum - self.ENS_RE = re.compile(r'ENS(?P[A-Z]{0,3})(?P[GTP])(?P\d{11})') - self.FB_RE = re.compile(r'FB(?P[gnptr]{2})(?P\d{7})') - self.NCBI_RE = re.compile(r'[A-Z]{3}\d{5}\.\d$') - self.WB_RE = re.compile(r'WBGene\d{8}$') - self.EC_RE = re.compile(r'\d+\.(\d+|-)\.(\d+|-)\.(\d+|-)') - self.ENSGENOME_RE = re.compile(b'Ensembl (Metazoa|Plant|Fungi|Protist|Bacteria)', re.IGNORECASE) - - self.FLUSH_SIZE = 5e6 - - # info about current genome - self.genomes_tab = genomes_tab - self._cur_genome = None - - def _get_genome_info(self, entry_nr): - if not (self._cur_genome is not None and self._cur_genome['EntryOff'] < entry_nr <= - self._cur_genome['EntryOff'] + self._cur_genome['TotEntries']): - self._cur_genome = self.genomes_tab[self.genomes_tab['EntryOff'].searchsorted(entry_nr+1)-1] - return self._cur_genome - - def from_EnsemblGenome(self, entry_nr): - genome_info = self._get_genome_info(entry_nr) - return self.ENSGENOME_RE.search(genome_info['Release']) is not None - - def flush_buffers(self): - common.package_logger.info('flushing xrefs and ec buffers') - if len(self.xrefs) > 0: - self.xref_tab.append(sorted(uniq(self.xrefs))) - self.xrefs = [] - if len(self.ec) > 0: - self.ec_tab.append(sorted(uniq(self.ec))) - self.ec = [] - - def potential_flush(self): - if len(self.xrefs) > self.FLUSH_SIZE: - self.flush_buffers() - - def _add_to_xrefs(self, eNr, enum_nr, key, verif='unchecked'): - if not isinstance(eNr, int): - raise ValueError('eNr is of wrong type:' + str(eNr)) - self.xrefs.append((eNr, enum_nr, key.encode('utf-8'), self.verif_enum[verif], )) - - def key_value_handler(self, key, eNr, enum_nr, verif='unchecked'): - """basic handler that simply adds a key (the xref) under a given enum_nr""" - self._add_to_xrefs(eNr, enum_nr, key, verif) - - def multi_key_handler(self, multikey, eNr, enum_nr, verif='unchecked'): - """try to split the myltikey field using '; ' as a delimiter and add each - part individually under the passed enum_nr id type.""" - for key in multikey.split('; '): - if key.startswith('Rep'): - continue - pos = key.find('.Rep') - if pos > 0: - key = key[0:pos] - self._add_to_xrefs(eNr, enum_nr, key, verif) - - def assign_source_handler(self, multikey, eNr): - """handler that splits the multikey field at '; ' locations and - tries to guess for each part the id_type. 
If a type could be - identified, it is added under with this id type, otherwise left out.""" - for key in multikey.split('; '): - ens_match = self.ENS_RE.match(key) - if ens_match is not None: - typ = ens_match.group('typ') - if typ == 'P': - enum_nr = self.xrefEnum['Ensembl Protein'] - elif typ == 'G': - enum_nr = self.xrefEnum['Ensembl Gene'] - elif typ == 'T': - enum_nr = self.xrefEnum['Ensembl Transcript'] - common.package_logger.debug( - 'ensembl: ({}, {}, {})'.format(key, typ, enum_nr)) - self._add_to_xrefs(eNr, enum_nr, key, 'exact') - - for enum, regex in {'FlyBase': self.FB_RE, 'NCBI': self.NCBI_RE, 'WormBase': self.WB_RE}.items(): - match = regex.match(key) - if match is not None: - enum_nr = self.xrefEnum[enum] - self._add_to_xrefs(eNr, enum_nr, key, 'unchecked') - if self.from_EnsemblGenome(eNr): - self._add_to_xrefs(eNr, self.xrefEnum.EnsemblGenomes, key, 'exact') - - def go_handler(self, gos, enr): - self.go_manager.add_annotations(enr, gos) - - def ec_handler(self, ecs, enr): - for t in ecs.split('; '): - t = t.strip() - acc_match = self.EC_RE.match(t) - if acc_match is not None: - self.ec.append((enr, acc_match.group(0))) - - def description_handler(self, de, eNr): - self.desc_manager.add_description(eNr, de) - - def remove_uniprot_code_handler(self, multikey, eNr, enum_nr): - """remove the species part (sep by '_') of a uniprot long accession to the short acc""" - common.package_logger.debug( - 'remove_uniprot_code_handler called ({}, {},{})'.format(multikey, eNr, enum_nr)) - for key in multikey.split('; '): - pos = key.find('_') - if pos > 0: - self._add_to_xrefs(eNr, enum_nr, key[0:pos], 'exact') - else: - self._add_to_xrefs(eNr, enum_nr, key, 'exact') - - def build_suffix_index(self, force=False): - parent, name = os.path.split(self.xref_tab._v_pathname) - file_ = self.xref_tab._v_file - idx_node = get_or_create_tables_node(file_, os.path.join(parent, "{}_Index".format(name))) - for arr_name, typ in (('buffer', tables.StringAtom(1)), ('offset', tables.UInt32Atom())): - try: - n = idx_node._f_get_child(arr_name) - if not force: - raise tables.NodeError("Suffix index for xrefs does already exist. 
Use 'force' to overwrite") - n.remove() - except tables.NoSuchNodeError: - pass - file_.create_earray(idx_node, arr_name, typ, (0,), expectedrows=100e6) - buf, off = (idx_node._f_get_child(node) for node in ('buffer', 'offset')) - self._build_lowercase_xref_buffer(buf, off) - sa = sais(buf) - try: - idx_node._f_get_child('suffix').remove() - except tables.NoSuchNodeError: - pass - file_.create_carray(idx_node, 'suffix', obj=sa) - - def _build_lowercase_xref_buffer(self, buf, off): - cur_pos = 0 - for xref_row in tqdm(self.xref_tab): - lc_ref = xref_row['XRefId'].lower() - ref = numpy.ndarray((len(lc_ref),), buffer=lc_ref, dtype=tables.StringAtom(1)) - buf.append(ref) - off.append([cur_pos]) - cur_pos += len(lc_ref) - - -class DarwinDbEntryParser: - def __init__(self): - """Initializes a Parser for SGML formatted darwin database file - """ - self.tag_handlers = collections.defaultdict(list) - self.end_of_entry_notifier = [] - - def add_tag_handler(self, tag, handler): - """add a callback handler for a certain tag""" - self.tag_handlers[tag].append(handler) - common.package_logger.debug('# handlers for {}: {}'.format(tag, len(self.tag_handlers[tag]))) - - def add_end_of_entry_notifier(self, handler): - self.end_of_entry_notifier.append(handler) - - def parse_entrytags(self, fh): - """ AC, CHR, DE, E, EMBL, EntrezGene, GI, GO, HGNC_Name, HGNC_Sym, - ID, InterPro, LOC, NR , OG, OS, PMP, Refseq_AC, Refseq_ID, SEQ, - SwissProt, SwissProt_AC, UniProt/TrEMBL, WikiGene, flybase_transcript_id - - :param fh: an already opened file handle to the darwin database - file to be parsed.""" - eNr = 0 - for line in fh: - line = line.strip() - if not line.startswith(''): - common.package_logger.debug('skipping line:' + line) - continue - - eNr += 1 - common.package_logger.debug('entry {}: {}'.format(eNr, line.encode('utf-8'))) - entry = lxml.html.fragment_fromstring(line) - for tag, handlers in self.tag_handlers.items(): - common.package_logger.debug('tag {} ({} handlers)'.format(tag, len(handlers))) - tag_text = [t.text for t in entry.findall('./' + tag.lower())] - for value in tag_text: - # common.package_logger.debug('value of tag: {}'.format(value.encode('utf-8'))) - if value is None: - continue - for handler in handlers: - handler(value, eNr) - # common.package_logger.debug('called handler {} with ({},{})'.format( - # handler, value.encode('utf-8'), eNr)) - for notifier in self.end_of_entry_notifier: - notifier() - - -DomainDescription = collections.namedtuple('DomainDescription', - tables.dtype_from_descr(tablefmt.DomainDescriptionTable).names) - - -class CathDomainNameParser(object): - re_pattern = re.compile(r'(?P[0-9.]*)\s{3,}\w{7}\s{3,}:\s*(?P.*)') - source = b'CATH/Gene3D' - - def __init__(self, url): - self.fname = download_url_if_not_present(url) - - def parse(self): - open_lib = gzip.open if self.fname.endswith('.gz') else open - with open_lib(self.fname, 'rt') as fh: - for line in fh: - match = self.re_pattern.match(line) - if match is not None: - yield DomainDescription(DomainId=match.group('id').encode('utf-8'), - Source=self.source, - Description=match.group('desc').encode('utf-8')) - - -class PfamDomainNameParser(CathDomainNameParser): - re_pattern = re.compile(r'(?P\w*)\t\w*\t\w*\t\w*\t(?P.*)') - source = b'Pfam' - - -def augment_genomes_json_download_file(fpath, h5, backup='.bak'): - """Augment the genomes.json file in the download section with additional info - - This function stores the ncbi taxonomy identifiers of internal nodes and adds - the number of ancestral genes to the internal 
nodes. - - :param fpath: path to genomes.json file - :param h5: hdf5 database handle.""" - common.package_logger.info("Augmenting genomes.json file with Nr of HOGs per level") - # load nr of ancestral genomes at each level - ancestral_hogs = collections.Counter() - step = 2**15 - hog_tab = h5.get_node('/HogLevel') - for start in range(0, len(hog_tab), step): - ancestral_hogs.update((l.decode() for l in hog_tab.read(start, stop=start+step, field='Level'))) - # load taxonomy and sorter by Name - tax = h5.get_node('/Taxonomy').read() - sorter = numpy.argsort(tax['Name']) - with open(fpath, 'rt') as fh: - genomes = json.load(fh) - os.rename(fpath, fpath + '.bak') - - def traverse(node): - if 'children' not in node: - return - for child in node['children']: - traverse(child) - try: - node['nr_hogs'] = ancestral_hogs[node['name']] - except KeyError as e: - common.package_logger.warning('no ancestral hog counts for '+node['name']) - node['nr_hogs'] = 0 - - try: - n = node['name'].encode('utf-8') - idx = numpy.searchsorted(tax['Name'], n, sorter=sorter) - if tax['Name'][sorter[idx]] == n: - node['taxid'] = int(tax['NCBITaxonId'][sorter[idx]]) - else: - raise ValueError('not in taxonomy: {}'.format(n)) - except Exception: - common.package_logger.exception('Cannot identify taxonomy id') - - traverse(genomes) - with open(fpath, 'wt') as fh: - json.dump(genomes, fh) - - -def getLogger(level='DEBUG'): - import logging - - log = logging.getLogger('pyoma') - if isinstance(level, str): - level = logging.getLevelName(level.upper()) - if not isinstance(level, int): - level = logging.DEBUG - log.setLevel(level) - logHandler = logging.StreamHandler() - logHandler.setLevel(level) - logHandler.setFormatter(logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) - log.addHandler(logHandler) - return log - - -def main(name="OmaServer.h5", k=6, idx_name=None, domains=None, log_level='INFO'): - idx_name = (name + '.idx') if idx_name is None else idx_name - - log = getLogger(log_level) - x = DarwinExporter(name, logger=log) - x.add_version() - x.add_species_data() - x.add_orthologs() - x.add_same_species_relations() - x.add_proteins() - x.add_hogs() - x.add_xrefs() - x.add_synteny_scores() - x.add_homoeology_confidence() - if domains is None: - domains = ["file://dev/null"] - x.add_domain_info(filter_duplicated_domains(only_pfam_or_cath_domains(itertools.chain( - iter_domains('ftp://orengoftp.biochem.ucl.ac.uk/gene3d/CURRENT_RELEASE/' + - 'representative_uniprot_genome_assignments.csv.gz'), - iter_domains('file://{}/additional_domains.mdas.csv.gz'.format(os.getenv('DARWIN_BROWSERDATA_PATH', ''))) - )))) - x.add_domainname_info(itertools.chain( - CathDomainNameParser('http://download.cathdb.info/cath/releases/latest-release/' - 'cath-classification-data/cath-names.txt').parse(), - PfamDomainNameParser('ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz').parse())) - x.add_canonical_id() - x.add_group_metadata() - x.add_hog_domain_prevalence() - x.close() - - x = DarwinExporter(name, logger=log) - x.create_indexes() - x.add_sequence_suffix_array(k=k, fn=idx_name) - x.update_summary_stats() - - genomes_json_fname = os.path.normpath(os.path.join( - os.path.dirname(name), '..', 'downloads', 'genomes.json')) - augment_genomes_json_download_file(genomes_json_fname, x.h5) - x.close() diff --git a/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py b/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py deleted file mode 100755 index 1857f3e..0000000 --- 
a/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py +++ /dev/null @@ -1,139 +0,0 @@ -from .convert import * -from pyoma.browser import OrthoXMLSplitter -import os - - -class StandaloneExporter(DarwinExporter): - DRW_CONVERT_FILE = os.path.abspath(os.path.splitext(__file__)[0] + ".drw") - - def __init__(self, root, name, **kwargs): - os.environ['DARWIN_BROWSERDATA_PATH'] = os.path.abspath(root) - super(StandaloneExporter, self).__init__(name, **kwargs) - self.transformed = False - self.cache_dir = os.path.join(os.getenv('DARWIN_BROWSERDATA_PATH'), 'pyoma') - - def add_homologs(self): - self.assert_cached_results() - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'homologs' not in rel_node_for_genome: - pass - - def get_version(self): - # TODO: obtain real version - return "OmaStandalone; 1.0.x" - - def assert_cached_results(self): - if not self.transformed: - res = self.call_darwin_export("TransformDataToCache('{}');".format( - self.cache_dir)) - if res != 'success': - raise DarwinException('could not transform data from darwin') - self.transformed = True - os.environ['DARWIN_NETWORK_SCRATCH_PATH'] = os.getenv('DARWIN_BROWSERDATA_PATH') - - def add_orthologs(self): - self.assert_cached_results() - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'VPairs' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'vps', '{}.txt.gz'.format(genome)) - if os.path.exists(cache_file): - data = load_tsv_to_numpy((cache_file, 0, 0, False,)) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetVPsForGenome({})'.format(genome)) - - vp_tab = self.h5.create_table(rel_node_for_genome, 'VPairs', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, vp_tab) - self._write_to_table(vp_tab, data) - vp_tab.cols.EntryNr1.create_csindex() - - def add_hogs(self): - hog_path = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], 'Output') - - entryTab = self.h5.get_node('/Protein/Entries') - - tree_filename = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], - 'EstimatedSpeciesTree.nwk') - - hog_converter = HogConverter(entryTab) - - if os.path.exists(tree_filename): - hog_converter.attach_newick_taxonomy(tree_filename) - - fn = 'HierarchicalGroups.orthoxml' - - # Split the OrthoXML up (puts in cache_dir/split_hog). 
- hog_cache_dir = os.path.join(self.cache_dir, 'split_hogs') - ortho_splitter = OrthoXMLSplitter.OrthoXMLSplitter(os.path.join(hog_path, fn), cache_dir=hog_cache_dir) - ortho_splitter() - - hogTab = self.h5.create_table('/', 'HogLevel', tablefmt.HOGsTable, - 'nesting structure for each HOG', expectedrows=1e8) - self.orthoxml_buffer = self.h5.create_earray('/OrthoXML', 'Buffer', - tables.StringAtom(1), (0,), 'concatenated orthoxml files', - expectedrows=1e9, createparents=True) - self.orthoxml_index = self.h5.create_table('/OrthoXML', 'Index', tablefmt.OrthoXmlHogTable, - 'Range index per HOG into OrthoXML Buffer', expectedrows=5e6) - - try: - levels = hog_converter.convert_file(os.path.join(hog_path, fn)) - hogTab.append(levels) - fam_nrs = set([z[0] for z in levels]) - for fam_nr in fam_nrs: - hog_fn = "HOG{:06d}.orthoxml".format(fam_nr) - self.add_orthoxml(os.path.join(hog_cache_dir, hog_fn), [fam_nr]) - except Exception as e: - self.logger.error('an error occured while processing ' + fn + ':') - self.logger.exception(e) - - hog_converter.write_hogs() - - def _get_genome_database_paths(self): - return self.call_darwin_export('GetGenomeFileNames();') - - def xref_databases(self): - return self._get_genome_database_paths() - - -def import_oma_run(path, outfile, add_domains=True, log_level='INFO'): - log = getLogger(log_level) - x = StandaloneExporter(path, outfile, logger=log, mode='write') - x.add_version() - x.add_species_data() - x.add_orthologs() - x.add_proteins() - x.add_hogs() - x.add_xrefs() - domain_url = ('ftp://orengoftp.biochem.ucl.ac.uk/gene3d/CURRENT_RELEASE/'+ - 'representative_uniprot_genome_assignments.csv.gz') - if not add_domains: - domain_url = 'file:///dev/null' - x.add_domain_info(only_pfam_or_cath_domains(iter_domains(domain_url))) - x.add_domainname_info(itertools.chain( - CathDomainNameParser('http://download.cathdb.info/cath/releases/latest-release/' - 'cath-classification-data/cath-names.txt').parse(), - PfamDomainNameParser('ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz').parse())) - x.add_canonical_id() - x.add_group_metadata() - x.add_hog_domain_prevalence() - x.close() - - x = StandaloneExporter(path, outfile, logger=log) - x.create_indexes() - x.add_sequence_suffix_array() - x.update_summary_stats() - x.close() - - -if __name__ == "__main__": - import_oma_run('~/Repositories/OmaStandalone', 'oma.h5') diff --git a/src/HogProf/build/lib/pyoma/browser/db.py b/src/HogProf/build/lib/pyoma/browser/db.py deleted file mode 100755 index 17315fe..0000000 --- a/src/HogProf/build/lib/pyoma/browser/db.py +++ /dev/null @@ -1,1770 +0,0 @@ -from __future__ import division, print_function -from builtins import chr, range, object, zip, bytes -import io -import itertools -import time -from Bio.UniProt import GOA -from bisect import bisect_left -import dateutil -import pandas as pd -import pyopa -import tables -import threading -import numpy -import numpy.lib.recfunctions -import re -import json -import os -import collections -import logging -from .KmerEncoder import KmerEncoder -from .models import LazyProperty, KeyWrapper, ProteinEntry, Genome -from .geneontology import GeneOntology, OntologyParser, AnnotationParser, GOAspect -from xml.etree import ElementTree as et - -logger = logging.getLogger(__name__) - -# Raise stack limit for PyOPA ~400MB -threading.stack_size(4096*100000) - -# Global initialisations -GAF_VERSION = '2.1' - - -def count_elements(iterable): - """return the number of elements in an iterator in the most efficient way. 
- - Be aware that for unbound iterators, this method won't terminate! - :param iterable: an iterable object. - """ - counter = itertools.count() - collections.deque(zip(iterable, counter), maxlen=0) # (consume at C speed) - return next(counter) - - -_first_cap_re = re.compile('(.)([A-Z][a-z]+)') -_all_cap_re = re.compile('([a-z0-9])([A-Z])') -def to_snail_case(name): - """function to convert from CamelCase to snail_case""" - s1 = _first_cap_re.sub(r'\1_\2', name) - return _all_cap_re.sub(r'\1_\2', s1).lower() - - -class Database(object): - """This is the main interface to the oma database. Queries - will typically be issued by methods of this object. Typically - the result of queries will be :py:class:`numpy.recarray` objects.""" - EXPECTED_DB_SCHEMA = "3.2" - - def __init__(self, db): - if isinstance(db, str): - logger.info('opening {} for read-only'.format(db)) - self.db = tables.open_file(db, 'r') - elif isinstance(db, tables.File): - self.db = db - else: - raise ValueError(str(db) + ' is not a valid database type') - - try: - db_version = self.db.get_node_attr('/', 'db_schema_version') - except AttributeError: - db_version = "1.0" - - logger.info('database version: {}'.format(db_version)) - if db_version != self.EXPECTED_DB_SCHEMA: - exp_tup = self.EXPECTED_DB_SCHEMA.split('.') - db_tup = db_version.split('.') - if db_tup[0] != exp_tup[0]: - raise DBVersionError('Unsupported database version: {} != {} ({})' - .format(db_version, self.EXPECTED_DB_SCHEMA, self.db.filename)) - else: - logger.warning("outdated database version, but only minor version change: " - "{} != {}. Some functions might fail" - .format(db_version, self.EXPECTED_DB_SCHEMA)) - self.db_schema_version = tuple(int(z) for z in db_version.split(".")) - - try: - self.seq_search = SequenceSearch(self) - except DBConsistencyError as e: - logger.exception("Cannot load SequenceSearch. Any future call to seq_search will fail!") - self.seq_search = object() - self.id_resolver = IDResolver(self) - self.id_mapper = IdMapperFactory(self) - genomes = [Genome(self, g) for g in self.db.root.Genome.read()] - self.tax = Taxonomy(self.db.root.Taxonomy.read(), - genomes={g.ncbi_taxon_id: g for g in genomes}) - self._re_fam = None - self.format_hogid = None - self._set_hogid_schema() - - @LazyProperty - def gene_ontology(self): - """returns GeneOntology object containing hierarchy - of terms using the is_a and part_of relations. See - :meth:`load_gene_ontology` to parametrize the - creation of GeneOntology object.""" - return self.load_gene_ontology(GeneOntology) - - def load_gene_ontology(self, factory=None, rels=None): - """Instantiate GeneOntology object - - By default, a GeneOntology object is returned based on - the default relations (which are defined in :mod:`.gene_ontology`) - - The factory parameter allows to specify an subtype of - GeneOntology, e.g. :class:`.gene_ontology.FreqAwareGeneOntology`, - - The rels parameter should be a list of relation strings that - should be used as parents relations. 
- - :param factory: GeneOntology factory - :param rels: list of rels for parent relations - - :returns: GeneOntology object""" - try: - fp = io.StringIO(self.db.root.Ontologies.GO.read().tobytes().decode('utf-8')) - except tables.NoSuchNodeError: - p = os.path.join(os.path.dirname(self.db.filename), 'go-basic.obo') - fp = open(p, 'rt') - if factory is None: - factory = GeneOntology - go = factory(OntologyParser(fp), rels=rels) - go.parse() - fp.close() - return go - - def get_hdf5_handle(self): - """return the handle to the database hdf5 file""" - return self.db - - def get_conversion_date(self): - """return the conversion end date from the DB attributes""" - return dateutil.parser.parse(self.db.root._v_attrs['conversion_end']) - - def ensure_entry(self, entry): - """This method allows to use an entry or an entry_nr. - - If necessary it will load the entry from the entry_nr, - otherwise returning the same object again. - - :param entry: the entry_nr of a protein to be loaded or a - protein entry.""" - try: - t = entry['AltSpliceVariant'] - return entry - except (TypeError, AttributeError, IndexError): - if isinstance(entry, (int, numpy.number)): - return self.entry_by_entry_nr(entry) - raise TypeError('Invalid type to retrieve an Entry') - except Exception: - raise TypeError('Invalid type to retrieve an Entry') - - def entry_by_entry_nr(self, entry_nr): - """Returns the entry from the /Protein/Entries table - corresponding to entry_nr. - - :param int entry_nr: a numeric identifier for the protein - entry""" - entry = self.db.root.Protein.Entries[entry_nr - 1] - if entry['EntryNr'] != entry_nr: - logger.warning('EntryNr {} not at position {}. Using index instead'.format(entry_nr, entry_nr - 1)) - entry = self.db.root.Protein.Entries.read_where( - 'EntryNr == {:d}'.format(entry_nr)) - if len(entry) != 1: - raise ValueError("there are {} entries with entry_nr {}".format(len(entry), entry_nr)) - entry = entry[0] - return entry - - def _set_hogid_schema(self): - """Determines the used HOG ID schema - - Some versions of the database have HOG IDs of the form - "HOG:0000001" and others without the prefix (e.g. standalone) - or with the prefix, but without padding. This method checks - which schema is used and sets the appropriate member vars - """ - re_id = re.compile(b'(?PHOG:)(?P\d+)') - for entry in self.db.root.Protein.Entries: - m = re_id.match(entry['OmaHOG']) - if m is None: - continue - nr = m.group('nr') - if len(nr) >= 7 and not nr.startswith(b'0'): - continue # a case where we cannot determine if padded nr - is_padded = nr.startswith(b'0') - prefix = m.group('prefix').decode() - if prefix is None: - prefix = '' - fmt = "{}{{:{}d}}".format(prefix, "07" if is_padded else "") - self._re_fam = re.compile('{}(?P\d{})' - .format(prefix, "{7,}" if is_padded else "+") - .encode('ascii')) - self.format_hogid = lambda fam: fmt.format(fam) - logger.info("setting HOG ID schema: re_fam: {}, hog_fmt: {}" - .format(self._re_fam, fmt)) - return - raise DBConsistencyError('no protein in a hog') - - def all_proteins_of_genome(self, genome): - """return all protein entries of a genome""" - rng = self.id_mapper['OMA'].genome_range(genome) - prot_tab = self.get_hdf5_handle().get_node('/Protein/Entries') - return prot_tab.read_where('(EntryNr >= {}) & (EntryNr <= {})'.format(rng[0], rng[1])) - - def main_isoforms(self, genome): - """returns the proteins that are the main isoforms of a genome. - - The main isoform is in this context the isoform that we used in OMA to - infer the orthologs. 
It is the one variant that has the most alignment - matches to all other gnomes. - - The genome parameter should be the UniProtSpeciesCode of the species of - interest. If it is a numeric value, the genome parameter is interpreted - as the protein entrynr. The method returns then the main isoforms for - the species to which this protein belongs. - - :Note: OMA only predicts orthologs for the main isoform, so there is no - difference if you work with only the main isoforms or all proteins of - a genome in terms of orthologs. - - :param genome: UniProtSpeciesCode of the genome of interest, or a gene - number (EntryNr) from the genome of interest. - """ - rng = self.id_mapper['OMA'].genome_range(genome) - prot_tab = self.get_hdf5_handle().get_node('/Protein/Entries') - return prot_tab.read_where( - '(EntryNr >= {}) & (EntryNr <= {}) & ((AltSpliceVariant == EntryNr) | (AltSpliceVariant == 0))' - .format(rng[0], rng[1])) - - def get_splicing_variants(self, entry): - e = self.ensure_entry(entry) - if e['AltSpliceVariant'] == 0: - return numpy.array([e], dtype=e.dtype) - # TODO: create index on AltSpliceVariant column?! - return self.get_hdf5_handle().get_node('/Protein/Entries').read_where( - '(EntryNr >= {:d}) & (EntryNr < {:d}) & (AltSpliceVariant == {:d})' - .format(e['EntryNr']-100, e['EntryNr']+100, e['AltSpliceVariant'])) - - def _get_vptab(self, entry_nr): - return self._get_pw_tab(entry_nr, 'VPairs') - - def _get_pw_tab(self, entry_nr, subtab): - genome = self.id_mapper['OMA'].genome_of_entry_nr(entry_nr)['UniProtSpeciesCode'].decode() - return self.db.get_node('/PairwiseRelation/{}/{}'.format(genome, subtab)) - - def count_vpairs(self, entry_nr): - vptab = self._get_vptab(entry_nr) - try: - cnt = count_elements(vptab.where('(EntryNr1=={:d})'.format(entry_nr))) - except (TypeError, ValueError): - cnt = 0 - return cnt - - def count_homoeologs(self, entry_nr): - pwtab = self._get_pw_tab(entry_nr, 'within') - homolog_typ_nr = pwtab.get_enum('RelType')['homeolog'] - try: - cnt = count_elements(pwtab.where('(EntryNr1=={:d}) & (RelType == {:d})'.format(entry_nr, homolog_typ_nr))) - except (TypeError, ValueError): - cnt = 0 - return cnt - - def _get_pw_data(self, entry_nr, tab, typ_filter=None, extra_cols=None): - query = "(EntryNr1 == {:d})".format(entry_nr) - if typ_filter is not None: - query += " & (RelType == {:d})".format(typ_filter) - dat = tab.read_where(query) - typ = tab.get_enum('RelType') - cols = ['EntryNr1', 'EntryNr2', 'Score', 'Distance'] - if extra_cols is not None: - cols.extend(extra_cols) - res = numpy.lib.recfunctions.append_fields( - dat[cols], - names='RelType', - data=[typ(x) for x in dat['RelType']], - usemask=False) - return res - - def get_vpairs(self, entry_nr): - """returns the verified pairs of a query protein. - - This method returns an instance of a :class:`numpy.recarray` class - containing the verified pairs of a query protein entry. - The returned array contains columns with EntryNr1 and EntryNr2 to - identify the pair together with RelType (indicating the subtype of - orthology), the alignment score and the distance. The score and - distance will be set to -1 if unknown. - - :param int entry_nr: the numeric entry_nr of the query protein.""" - vp_tab = self._get_vptab(entry_nr) - return self._get_pw_data(entry_nr, vp_tab) - - def get_within_species_paralogs(self, entry_nr): - """returns the within species paralogs of a given entry - - This method returns a :class:`numpy.recarray` instance - containing the close paralogs. 
Close paralogs are within - species paralogs that are inparalogs to at least one - ortholog of the query gene in OMA. - - The returned array contains columns with EntryNr1 and EntryNr2 to - identify the pair together with RelType (indicating the subtype of - paralogy), the alignment score and the distance. The score and - distance will be set to -1 if unknown. - - :param int entry_nr: the numeric entry_id of the query protein""" - within_species_paralogs = self._get_pw_tab(entry_nr, 'within') - return self._get_pw_data(entry_nr, within_species_paralogs) - - def get_homoeologs(self, entry_nr): - within_species = self._get_pw_tab(entry_nr, 'within') - homolog_typ_nr = within_species.get_enum('RelType')['homeolog'] - return self._get_pw_data(entry_nr, within_species, - typ_filter=homolog_typ_nr, - extra_cols=['SyntenyConservationLocal', 'Confidence']) - - def neighbour_genes(self, entry_nr, window=1): - """Returns neighbor genes around a query gene. - - This method returns a tuple containing a numpy recarray with - gene entries located around the query gene, and an index - pointing to the query gene. The genes are sorted according to - their position on the chromosome. - - The *windows* parameter specifies the number of genes up- and - downstream of the query gene that should be reported. Note - that the actual number can be smaller if the query gene is close - to a chromosome start or end. - - :param entry_nr: the entry number of the query gene - :param window: the number of neighboring genes on each - side to return""" - if window <= 0 or not isinstance(window, int): - raise ValueError('windows parameters must be a positive integer value') - - dat = self.entry_by_entry_nr(entry_nr) - target_chr = dat['Chromosome'] - genome_range = self.id_mapper['OMA'].genome_range(entry_nr) - f = 5 - data = self.db.root.Protein.Entries.read_where( - '(EntryNr >= {:d}) & (EntryNr <= {:d}) & ' - '(Chromosome == {!r}) & ' - '((AltSpliceVariant == 0) |' - ' (AltSpliceVariant == EntryNr))'.format( - max(genome_range[0], entry_nr - f * window), - min(genome_range[1], entry_nr + f * window), - target_chr)) - data.sort(order=['EntryNr']) - idx = data['EntryNr'].searchsorted(entry_nr) - res = data[max(0, idx - window):min(len(data), idx + window + 1)] - idx = res['EntryNr'].searchsorted(entry_nr) - return res, idx - - def parse_hog_id(self, hog_id): - hog_id = hog_id if isinstance(hog_id, bytes) else hog_id.encode('ascii') - m = self._re_fam.match(hog_id) - if m is not None: - return int(m.group('fam')) - else: - raise ValueError('invalid hog id format') - - def hog_family(self, entry): - entry = self.ensure_entry(entry) - m = self._re_fam.match(entry['OmaHOG']) - if m is None: - raise Singleton(entry) - return int(m.group('fam')) - - def hog_levels_of_fam(self, fam_nr): - """get all taxonomic levels covered by a family. - - The family coresponds to the toplevel numeric id of a HOG, - i.e. for HOG:002421 the fam_nr should be 2421. If a HOG - covers a certain level more than once, it will be returned - several times. - - :param fam_nr: the numeric id of the family (== Toplevel HOG) - """ - return self.db.root.HogLevel.read_where( - '(Fam=={})'.format(fam_nr))['Level'] - - def get_subhogids_at_level(self, fam_nr, level): - """get all the hog ids within a given family at a given taxonomic - level of interest. - - After a duplication in an ancestor lineage, there exists multiple - sub-hogs for any taxonomic level after the duplication. This method - allows to get the list of hogids at the requested taxonomic level. 
- - E.g. assume in family 1 (HOG:0000001) there has been a duplication - between Eukaryota and Metazoa. this method would return for - get_subhogids_at_level(1, 'Eukaryota') --> ['HOG:0000001'] - and for - get_subhogids_at_level(1, 'Metazoa') --> ['HOG:0000001.1a', 'HOG:0000001.1b'] - - :param fam_nr: the numeric family id - :param level: the taxonomic level of interest""" - lev = level if isinstance(level, bytes) else level.encode('ascii') - return self.db.root.HogLevel.read_where( - '(Fam=={}) & (Level=={!r})'.format(fam_nr, lev))['ID'] - - def member_of_hog_id(self, hog_id, level=None): - """return an array of protein entries which belong to a given hog_id. - - E.g. if hog_id = 'HOG122.1a', the method returns all the proteins that - have either exactly this hog id or an inparalogous id such a HOG122.1a.4b.2a - - If you are only interested in the members of a specific lineage (identified - through its taxonomic range), you can pass the taxonomic range as an - additional argument. Only the proteins of genomes belonging to this clade - will be returned. Otherwise, all proteins with having this specific hog_id - will be returned. - - :param str hog_id: the requested hog_id. - :param level: the taxonomic level of interest - :type level: str or None - - :return: a numpy.array with the protein entries belonging to the requested hog. - :rtype: :class:`numpy.ndarray` - - :Note: Even if you obtained a certain hog_id using - :py:meth:`get_subhogids_at_level` - using a certain level, if you do not specify the level in - :meth:`member_of_hog_id` again, you will likely get proteins from other - clades. Only if it happens that the deepest level of the hog_id - coincides with the taxonomic range of interest, the two will be identical. - """ - hog_range = self._hog_lex_range(hog_id) - # get the proteins which have that HOG number - memb = self.db.root.Protein.Entries.read_where( - '({!r} <= OmaHOG) & (OmaHOG < {!r})'.format(*hog_range)) - if level is not None: - memb = [x for x in memb if level.encode('ascii') in self.tax.get_parent_taxa( - self.id_mapper['OMA'].genome_of_entry_nr(x['EntryNr'])['NCBITaxonId'])['Name']] - - return memb - - def iter_members_of_hog_id(self, hog_id): - """iterates over all proteins that belong to a specific hog_id. - - A hog_id might be an ID of the following form: HOG:0000212.1a - This method will yield all proteins in the form of - :class:`ProteinEntry` instances that are part of this hog_id. - - :param str hog_id: the requested HOG ID. - :return: :py:class:`ProteinEntry` objects - :rtype: iter(:class:`ProteinEntry`)""" - hog_range = self._hog_lex_range(hog_id) - it = self.db.root.Protein.Entries.where( - '({!r} <= OmaHOG) & (OmaHOG < {!r})'.format(*hog_range)) - for row in it: - yield ProteinEntry(self, row.fetch_all_fields()) - - def member_of_fam(self, fam): - """returns an array of protein entries which belong to a given fam""" - if not isinstance(fam, (int, numpy.number)): - raise ValueError('expect a numeric family id') - return self.member_of_hog_id(self.format_hogid(fam)) - - def hog_members(self, entry, level): - """get hog members with respect to a given taxonomic level. - - The method will return a list of protein entries that are all - member of the same hog with respect to the taxonomic range - of interest. 
- - :param entry: an entry or entry_nr of a query protein - :param level: the taxonomic level of interest""" - query = self.ensure_entry(entry) - members = self.hog_members_from_hog_id(query['OmaHOG'], level) - if query not in members: - raise ValueError(u"Level '{0:s}' undefined for query gene".format(level)) - return members - - def hog_members_from_hog_id(self, hog_id, level): - """get hog members with respect to a given taxonomic level. - - The method will return a list of protein entries that are all - member of the same hog with respect to the taxonomic range - of interest. - - :param bytes hog_id: the query hog id - :param str level: the taxonomic level of interest""" - if isinstance(hog_id, str): - hog_id = hog_id.encode('ascii') - query_fam = self.parse_hog_id(hog_id) - hoglev = None - for hog_candidate in self.db.root.HogLevel.where( - '(Fam == {:d}) & (Level == {!r})'.format(query_fam, level.encode('ascii'))): - if hog_id.startswith(hog_candidate['ID']): - hoglev = hog_candidate - break - if hoglev is None: - raise ValueError(u'Level "{0:s}" undefined for query gene'.format(level)) - # get the entries which have this hogid (or a sub-hog) - members = self.member_of_hog_id(hoglev['ID']) - if level != 'LUCA': - # last, we need to filter the proteins to the tax range of interest - members = [x for x in members if level.encode('ascii') in self.tax.get_parent_taxa( - self.id_mapper['OMA'].genome_of_entry_nr(x['EntryNr'])['NCBITaxonId'])['Name']] - return members - - def get_orthoxml(self, fam): - """returns the orthoxml of a given toplevel HOG family - - :param fam: numeric id of requested toplevel hog""" - idx = self.db.root.OrthoXML.Index.read_where('Fam == {:d}'.format(fam)) - if len(idx) < 1: - raise ValueError('cannot retrieve orthoxml for {}'.format(fam)) - idx = idx[0] - return self.db.root.OrthoXML.Buffer[ - idx['HogBufferOffset']:idx['HogBufferOffset'] + idx['HogBufferLength']].tostring() - - def _hog_lex_range(self, hog): - """return the lexographic range of a hog. - - This can be used to search of sub-hogs which are nested in - the query hog. The semantics is such that - _hog_lex_range[0] <= hog < _hog_lex_range[1]. - This is equivalent to say that a sub-hog starts with the - query hog.""" - hog_str = hog.decode() if isinstance(hog, bytes) else hog - return hog_str.encode('ascii'), (hog_str[0:-1] + chr(1 + ord(hog_str[-1]))).encode('ascii') - - def oma_group_members(self, group_id): - """get the member entries of an oma group. - - This method returns a numpy array of protein entries that form - an oma group. If the group id is invalid (not positive - integer value or a valid Fingerprint), an `InvalidId` Exception - is raised. 
- - :param group_id: numeric oma group id or Fingerprint""" - group_nr = self.resolve_oma_group(group_id) - members = self.db.root.Protein.Entries.read_where('OmaGroup=={:d}'.format(group_nr)) - return members - - def resolve_oma_group(self, group_id): - if isinstance(group_id, int) and 0 < group_id <= self.get_nr_oma_groups(): - return group_id - elif isinstance(group_id, numpy.integer): - return self.resolve_oma_group(int(group_id)) - elif isinstance(group_id, (bytes, str)): - if group_id.isdigit(): - return self.resolve_oma_group(int(group_id)) - if isinstance(group_id, str): - group_id = group_id.encode('utf-8') - if group_id == b'n/a': - raise InvalidId('Invalid ID (n/a) for an OMA Group') - if not self.seq_search.contains_only_valid_chars(group_id): - raise InvalidId("Invalid ID: non-amino-accids characters in Fingerprint or sequence pattern") - if len(group_id) == 7: - # most likely a fingerprint. let's check that first - group_meta_tab = self.db.get_node('/OmaGroups/MetaData') - try: - e = next(group_meta_tab.where('(Fingerprint == {!r})' - .format(group_id))) - return int(e['GroupNr']) - except StopIteration: - pass - # search in suffix array - entry_nrs = self.seq_search.exact_search( - group_id.decode(), only_full_length=False) - if len(entry_nrs) == 0: - raise InvalidId('No sequence contains search pattern') - group_nrs = {self.entry_by_entry_nr(nr)['OmaGroup'] for nr in entry_nrs} - group_nrs.discard(0) - if len(group_nrs) == 1: - return int(group_nrs.pop()) - elif len(group_nrs) == 0: - raise InvalidId("Sequence with pattern '{}' does not belong to any group" - .format(group_id.decode())) - else: - raise AmbiguousID("sequence pattern matches several oma groups", candidates=group_nrs) - raise InvalidId('Invalid type to determine OMA Group: {} (type: {})'.format(group_id, type(group_id))) - - def oma_group_metadata(self, group_nr): - """get the meta data associated with a OMA Group - - The meta data contains the fingerprint and the keywords infered for this group. - The method retuns this information as a dictionary. The parameter must be - the numeric oma group nr. - - :param int group_nr: a numeric oma group id.""" - if not isinstance(group_nr, (int, numpy.integer)) or group_nr < 0: - raise InvalidId('Invalid group nr: {} (type: {})'.format(group_nr, type(group_nr))) - meta_tab = self.db.get_node('/OmaGroups/MetaData') - try: - e = next(meta_tab.where('GroupNr == {:d}'.format(group_nr))) - kw_buf = self.db.get_node('/OmaGroups/KeywordBuffer') - res = {'fingerprint': e['Fingerprint'].decode(), - 'group_nr': int(e['GroupNr']), - 'keywords': kw_buf[e['KeywordOffset']:e['KeywordOffset']+e['KeywordLength']].tostring().decode(), - 'size': int(e['NrMembers'])} - return res - except StopIteration: - raise InvalidId('invalid group nr') - - def get_nr_oma_groups(self): - """returns the number of OMA Groups in the database""" - tab = self.db.get_node('/Protein/Entries') - try: - idx = tab.colindexes['OmaGroup'][-1] - return int(tab[idx]['OmaGroup']) - except KeyError: - hist = self.group_size_histogram('oma') - return int(hist['Count'].sum()) - - def get_nr_toplevel_hogs(self): - """returns the number of toplevel hogs, i.e. roothogs""" - hist = self.group_size_histogram('hog') - return int(hist['Count'].sum()) - - def group_size_histogram(self, typ=None): - """returns a table with two columns, e.g. Size and Count. - - if typ is set to 'oma' or not set, then the data for the - oma groups is returned. if it is set to 'hog', the data for - the rootlevel hogs is returned. 
- - :param typ: either 'oma' or 'hog', defaults to 'oma'""" - if typ is None or typ.lower() == 'oma': - tabname = 'OmaGroup' - elif typ.lower() == 'hog': - tabname = 'OmaHOG' - else: - raise ValueError('{} is not a valid group typ'.format(typ)) - tab = self.db.get_node('/Summary/{}_size_hist'.format(tabname)) - return tab.read() - - def get_sequence(self, entry): - """get the protein sequence of a given entry as a string - - :param entry: the entry or entry_nr for which the sequence is requested""" - entry = self.ensure_entry(entry) - seqArr = self.db.get_node('/Protein/SequenceBuffer') - seq = seqArr[entry['SeqBufferOffset']:entry['SeqBufferOffset'] + entry['SeqBufferLength'] - 1] - return seq.tostring() - - def get_cdna(self, entry): - """get the protein sequence of a given entry as a string""" - entry = self.ensure_entry(entry) - seqArr = self.db.get_node('/Protein/CDNABuffer') - seq = seqArr[entry['CDNABufferOffset']:entry['CDNABufferOffset'] + entry['CDNABufferLength'] - 1] - return seq.tostring() - - def get_description(self, entry): - entry = self.ensure_entry(entry) - descArr = self.db.get_node('/Protein/DescriptionBuffer') - desc = descArr[entry['DescriptionOffset']:entry['DescriptionOffset'] + entry['DescriptionLength']] - return desc.tostring() - - def get_release_name(self): - return str(self.db.get_node_attr('/', 'oma_version')) - - def get_exons(self, entry_nr): - genome = self.id_mapper['OMA'].genome_of_entry_nr(entry_nr)['UniProtSpeciesCode'].decode() - locus_tab = self.db.get_node('/Protein/Locus/{}'.format(genome)) - return locus_tab.read_where('EntryNr == {}'.format(entry_nr)) - - def get_domains(self, entry_nr): - try: - return self.db.root.Annotations.Domains.read_where('EntryNr == {:d}'.format(entry_nr)) - except ValueError as e: - raise InvalidId('require a numeric entry id, got {}'.format(entry_nr)) - - def get_representative_entry_of_hog(self, fam): - """Get the information of the representative entry for a given family (roothog). - - For each family we select a represenative entry that has the most prevalent - domain architecture. This method returns the entry_nr that we selected, together - with the domain architecture and its prevalence. In case no representative entry - has been found, the method raises an :class:`NoReprEntry` Exception. - - :param int fam: The numeric family number.""" - domprev_tab = self.db.get_node('/HOGAnnotations/DomainArchPrevalence') - try: - row = next(domprev_tab.where('Fam == {:d}'.format(fam))) - fields = (to_snail_case(z) for z in domprev_tab.dtype.names) - res = dict(zip(fields, row.fetch_all_fields())) - res['domains'] = self.get_domains(int(row['ReprEntryNr'])) - res['prevalence'] = 100.0 * res['prev_count'] / res['fam_size'] - return res - except StopIteration: - raise NoReprEntry() - - def get_prevalent_domains(self, fam): - # Gets the prevalent domains for a particular top level HOG / family. - # returns: (family_row, similar_families) - # family_row contains: family ID, representative entry, DA prevalence. - # similar_families contains: same, with similarity score. Ordered. - domprev_tab = self.db.get_node('/HOGAnnotations/DomainArchPrevalence') - dom2hog_tab = self.db.get_node('/HOGAnnotations/Domains') - - try: - fam_row = self.get_representative_entry_of_hog(fam) - except NoReprEntry: - return None, None - - # Get the family's consensus DA and count them... - fam_da = collections.Counter(fam_row['domains']['DomainId']) - - # Retrieve the relevant other families... 
- sim_fams = collections.defaultdict(collections.Counter) - for d in fam_da: - for hog_with_domain in dom2hog_tab.where('DomainId == {}'.format(d)): - sim_fams[hog_with_domain['Offset']][d] += 1 - - if len(sim_fams) == 0: - return fam_row, None - - # Now get similar families and order them by similarity - sim_fams_df = pd.DataFrame(domprev_tab[list(sim_fams.keys())]) - sim_fams_df['sim'] = list(map(lambda i: sum((sim_fams[i] & fam_da).values()), - sim_fams.keys())) - - # Sort by similarity & family size - sim_fams_df.sort_values(['sim', 'FamSize'], inplace=True, ascending=False) - sim_fams_df.reset_index(drop=True, inplace=True) - - # Prevalence - sim_fams_df['Prev'] = 100.0 * (sim_fams_df['PrevCount'] / sim_fams_df['FamSize']) - - return fam_row, sim_fams_df - - def get_gene_ontology_annotations(self, entry_nr, stop=None, as_dataframe=False, as_gaf=False): - """Retrieve the gene ontology annotations for an entry or entry_range - - The method returns the gene ontology annotations stored in the database - for a given entry (if `stop` parameter is not provided) or for all the - entries between [entry_nr, stop). Like in slices, the stop entry_nr is - not inclusive, where as the entry_nr - the start of the slice - is. - - By default the result are returned as numpy arrays of type - :class:`tablefmt.GeneOntologyTable`. If as_dataframe is set to true, the - result will be a pandas dataframe, and if as_gaf is set to true, a gaf - formatted text file with the annotations is returned. - - :param int entry_nr: numeric protein entry - """ - # function to check if an annotation term is obsolete - def filter_obsolete_terms(term): - try: - self.gene_ontology.term_by_id(term) - return True - except (KeyError, ValueError): - return False - try: - if stop is None: - query = 'EntryNr == {:d}'.format(entry_nr) - else: - if not isinstance(stop, int) or stop < entry_nr: - raise TypeError("stop argument needs to be a entry number that is larger than 'entry_nr'") - query = '(EntryNr >= {:d}) & (EntryNr < {:d})'.format(entry_nr, stop) - annots = self.db.root.Annotations.GeneOntology.read_where(query) - - # for test database we also have some obsolete terms. 
we need to filter those - if len(annots) > 0: - not_obsolete = numpy.vectorize(filter_obsolete_terms)(annots['TermNr']) - annots = annots[not_obsolete] - except ValueError as e: - raise InvalidId('require a numeric entry id, got {}'.format(entry_nr)) - if not as_dataframe and not as_gaf: - return annots - - # early return if no annotations available - if len(annots) == 0: - return '!gaf-version: {}\n'.format(GAF_VERSION) if as_gaf else None - - df = pd.DataFrame(annots) - - # 1R DB - df['DB'] = 'OMA' - # 2R DB Object ID - df['DB_Object_ID'] = df['EntryNr'].apply(self.id_mapper['Oma'].map_entry_nr) - # 3R DB Object Symbol - df['DB_Object_Symbol'] = df['DB_Object_ID'] - # 4O Qualifier - df['Qualifier'] = '' - # 5R GO ID - df['GO_ID'] = df['TermNr'].apply(lambda t: 'GO:{:07d}'.format(t)) - # 6R DB:Reference - df['DB:Reference'] = df['Reference'].apply(lambda x: x.decode('ascii')) - # 7R Evidence code - df['Evidence'] = df['Evidence'].apply(lambda x: x.decode('ascii')) - # 8O With (or) From - df['With'] = '' - # 9R Aspect - df['Aspect'] = df['GO_ID'].apply(lambda t: GOAspect.to_char(self.gene_ontology.term_by_id(t).aspect)) - # 10O DB Object Name - df['DB_Object_Name'] = '' - # 11O DB Object Synonym (|Synonym) - df['Synonym'] = '' - # 12R DB Object Type - df['DB_Object_Type'] = 'protein' - # 13R Taxon (|taxon) - df['Taxon_ID'] = df['EntryNr'].apply(lambda e: 'taxon:{:d}' - .format(self.id_mapper['Oma'].genome_of_entry_nr(e)['NCBITaxonId'])) - # 14R Date - df['Date'] = self.get_conversion_date().strftime('%Y%m%d') - # 15R Assigned by - TODO: FIX FOR NON OMA!!! - df['Assigned_By'] = df['DB'] - # 16O Annotation Extension - df['Annotation_Extension'] = '' - # 17O Gene Product Form ID - df['Gene_Product_Form_ID'] = '' - - df = df[GOA.GAF20FIELDS] - return (df if not as_gaf else - ('!gaf-version: {}\n'.format(GAF_VERSION) + - '\n'.join(df.apply(lambda e: '\t'.join(map(str, e)), axis=1)) + - '\n')) - - -class SuffixSearcher(object): - def __init__(self, suffix_index_node, buffer=None, lookup=None): - if isinstance(suffix_index_node, tables.Group): - self.buffer_arr = buffer if buffer else suffix_index_node._f_get_child('buffer') - self.suffix_arr = suffix_index_node._f_get_child('suffix') - self.lookup_arr = lookup if lookup else suffix_index_node._f_get_child('offset') - else: - self.buffer_arr = buffer - self.suffix_arr = suffix_index_node - self.lookup_arr = lookup - self.lookup_arr = self.lookup_arr[:] - - def find(self, query): - n = len(query) - if n > 0: - slicer = KeyWrapper(self.suffix_arr, - key=lambda i: - self.buffer_arr[i:(i + n)].tobytes()) - ii = bisect_left(slicer, query) - if ii and (slicer[ii] == query): - # Left most found. - jj = ii + 1 - while (jj < len(slicer)) and (slicer[jj] == query): - # zoom to end -> -> -> - jj += 1 - - # Find entry numbers and filter to remove incorrect entries - return numpy.searchsorted(self.lookup_arr, self.suffix_arr[ii:jj]+1) - 1 - return [] - - -class SequenceSearch(object): - ''' - Contains all the methods for searching the sequence - - TODO: implement taxonomic filtering. - ''' - from .KmerEncoder import DIGITS_AA - PROTEIN_CHARS = frozenset(map(lambda x: x.decode(), DIGITS_AA)) - PAM100 = pyopa.generate_env(pyopa.load_default_environments()['log_pam1'], - 100) - - def __init__(self, db): - # Backup reference to used DB method. 
- self.get_sequence = db.get_sequence - - # Assume the index is stored in the main DB if there is no .idx file - self.db = db.get_hdf5_handle() - self.db_idx = (self.db if not os.path.isfile(self.db.filename + '.idx') else - tables.open_file(self.db.filename + '.idx', 'r')) - - # Protein search arrays. - try: - self.seq_idx = self.db_idx.root.Protein.SequenceIndex - if isinstance(self.seq_idx, tables.link.ExternalLink): - self.seq_idx = self.seq_idx() - self.kmer_lookup = self.db_idx.root.Protein.KmerLookup - if isinstance(self.kmer_lookup, tables.link.ExternalLink): - self.kmer_lookup = self.kmer_lookup() - except (AttributeError, OSError) as e: - raise DBConsistencyError("Suffix index for protein sequences is not available: "+str(e)) - self.seq_buff = self.db.root.Protein.SequenceBuffer - self.n_entries = len(self.db.root.Protein.Entries) - - # Kmer lookup arrays / kmer setup - self.k = self.kmer_lookup._f_getattr('k') - self.encoder = KmerEncoder(self.k) - logger.info('KmerLookup of size k={} loaded'.format(self.k)) - - def get_entry_length(self, ii): - """Get length of a particular entry.""" - return self.db.root.Protein.Entries[ii - 1]['SeqBufferLength'] - 1 - - @LazyProperty - def entry_idx(self): - ''' - Caches the index lookup part of the SA. - ''' - return self.seq_idx[:self.n_entries] - - def get_entrynr(self, ii): - ''' - Get the entry number(s) corresponding to a location in the sequence - buffer. - ''' - return (numpy.searchsorted(self.entry_idx, ii) + 1) - - def contains_only_valid_chars(self, seq): - """returns true iff `seq` contains only valid AA chars. - - The method ignores the case of the seq, i.e. upper - or lower case chars both match. - - :param (bytes, str) seq: sequence to be checked - :returns bool - """ - if isinstance(seq, bytes): - seq = seq.decode() - return all(map(lambda c: c in self.PROTEIN_CHARS, seq.upper())) - - def _sanitise_seq(self, seq): - ''' - Sanitise a string protein sequence. Deletes "invalid" characters. - TODO: add functionality for biopython sequence / skbio sequence. - ''' - assert type(seq) == str - return ''.join(filter(lambda c: c in self.PROTEIN_CHARS, - seq.upper())).encode('ascii') - - def search(self, seq, n=None, coverage=None, is_sanitised=None): - ''' - Searches the database for entries that match. If can't find an exact - match performs a kmer + local alignment approach to approximate - search. - ''' - seq = (self._sanitise_seq(seq) if not is_sanitised else seq) - m = self.exact_search(seq, is_sanitised=True) - # TODO: taxonomic filtering. - if len(m) == 0: - # Do approximate search - m = self.approx_search(seq, n=n, coverage=coverage, is_sanitised=True) - # TODO: taxonomic filtering. - return ('approx', m) if m is not [] else None - else: - return 'exact', m - - def exact_search(self, seq, only_full_length=True, is_sanitised=None): - ''' - Performs an exact match search using the suffix array. - ''' - # TODO: work out whether to just use the approximate search and then - # check if any are actually exact matches. Do the counting and then - # do an equality checking on any of the sequences that have the correct - # number of kmer matches. - seq = (seq if is_sanitised else self._sanitise_seq(seq)) - nn = len(seq) - if nn > 0: - z = KeyWrapper(self.seq_idx, - key=lambda i: - self.seq_buff[i:(i + nn)].tobytes()) - ii = bisect_left(z, seq, lo=self.n_entries) - - if ii and (z[ii] == seq): - # Left most found. 
- jj = ii + 1 - while (jj < len(z)) and (z[jj] == seq): - # zoom to end -> -> -> - jj += 1 - - # Find entry numbers and filter to remove incorrect entries - return list(filter(lambda e: (not only_full_length) or self.get_entry_length(e) == nn, - self.get_entrynr(self.seq_idx[ii:jj]))) - - # Nothing found. - return [] - - def approx_search(self, seq, n=None, is_sanitised=None, coverage=None): - ''' - Performs an exact match search using the suffix array. - ''' - seq = (seq if is_sanitised else self._sanitise_seq(seq)) - n = (n if n is not None else 50) - coverage = (0.0 if coverage is None else coverage) - - # 1. Do kmer counting vs entry numbers TODO: switch to np.unique? - c = collections.Counter() - for z in map(lambda kmer: numpy.unique(self.kmer_lookup[int(kmer)], - return_counts=True), - self.encoder.decompose(seq)): - c.update(dict(zip(*z))) - - # 2. Filter to top n if necessary - z = len(seq) - self.k + 1 - cut_off = coverage * z - c = [(x[0], (x[1] / z)) for x in c.items() if x[1] >= cut_off] - c = (sorted(c, - reverse=True, - key=lambda x: x[1])[:n] if n > 0 else c) - - # 3. Do local alignments and return count / score / alignment - if len(c) > 0: - return sorted([(m[0], {'kmer_coverage': m[1], - 'score': a[0], - 'alignment': a[1]}) - for (m, a) in self._align_entries(seq, c)], - key=lambda z: z[1]['score'], - reverse=True) - return [] - - def _align_entries(self, seq, matches): - # Does the alignment for the approximate search - def align(s1, s2s, env, aligned): - for s2 in s2s: - z = pyopa.align_double(s1, s2, env, False, False, True) - a = pyopa.align_strings(s1, s2, env, False, z) - aligned.append((z[0], ((a[0].convert_readable(), - (z[3], z[1])), - (a[1].convert_readable(), - (z[4], z[2]))))) - - aligned = [] - query = pyopa.Sequence(seq.decode('ascii')) - entries = list(map(lambda m: - pyopa.Sequence(self.get_sequence(int(m[0])).decode('ascii')), - matches)) - t = threading.Thread(target=align, - args=(query, entries, self.PAM100, aligned)) - t.start() - t.join() - assert (len(aligned) > 0), 'Alignment thread crashed.' 
- return zip(matches, aligned) - - -class OmaIdMapper(object): - def __init__(self, db): - self.genome_table = db.get_hdf5_handle().root.Genome.read() - self._entry_off_keys = self.genome_table.argsort(order=('EntryOff')) - self._genome_keys = self.genome_table.argsort( - order=('UniProtSpeciesCode')) - self._taxid_keys = self.genome_table.argsort(order=('NCBITaxonId')) - self._omaid_re = re.compile(r'(?P[A-Z][A-Z0-9]{4})(?P\d+)') - self._db = db - - def genome_of_entry_nr(self, e_nr): - """returns the genome code belonging to a given entry_nr""" - idx = self.genome_table['EntryOff'].searchsorted( - e_nr - 1, side='right', - sorter=self._entry_off_keys) - return self.genome_table[self._entry_off_keys[idx - 1]] - - def map_entry_nr(self, entry_nr): - genome = self.genome_of_entry_nr(entry_nr) - return "{0:s}{1:05d}".format(genome['UniProtSpeciesCode'].decode(), - entry_nr - genome['EntryOff']) - - def genome_from_UniProtCode(self, code): - code = code.encode('ascii') - idx = self.genome_table['UniProtSpeciesCode'].searchsorted( - code, sorter=self._genome_keys) - try: - genome = self.genome_table[self._genome_keys[idx]] - except IndexError: - raise UnknownSpecies('{} is unknown'.format(code)) - - if genome['UniProtSpeciesCode'] != code: - raise UnknownSpecies('{} is unknown'.format(code)) - return genome - - def genome_from_taxid(self, taxid): - try: - taxid = int(taxid) - idx = self.genome_table['NCBITaxonId'].searchsorted( - taxid, sorter=self._taxid_keys) - genome = self.genome_table[self._taxid_keys[idx]] - except (IndexError, ValueError): - raise UnknownSpecies('TaxonId "{}" is unknown'.format(taxid)) - if genome['NCBITaxonId'] != taxid: - raise UnknownSpecies('TaxonId "{}" is unknown'.format(taxid)) - return genome - - def identify_genome(self, code): - """identify genome based on either a UniProtSpeciesCode or an - NCBI Taxonomy Id""" - if isinstance(code, int) or code.isdigit(): - return self.genome_from_taxid(code) - else: - return self.genome_from_UniProtCode(code) - - def omaid_to_entry_nr(self, omaid): - """returns the internal numeric entrynr from a - UniProtSpeciesCode+nr id. this is the inverse - function of 'map_entry_nr'.""" - match = self._omaid_re.match(omaid) - if match is None: - raise InvalidOmaId(omaid) - code, nr = match.group('genome'), int(match.group('nr')) - genome = self.genome_from_UniProtCode(code) - if nr <= 0 or nr > genome['TotEntries']: - raise InvalidOmaId(omaid) - return genome['EntryOff'] + int(match.group('nr')) - - def genome_range(self, query): - """returns the internal range of EntryNr associated with - 'query'. 'query' can be either a numeric id of a protein - or a UniProtSpeciesCode of a genome. If 'query' is unknown - by the database, an InvalidOmaId exception is raised. - - The return range is a tuple of length two, and the numbers - indicated the *inclusive* boundaries, e.g. (1,5) indicates - that the entries 1,2,3,4 and 5 belong to the query species""" - if isinstance(query, (int, numpy.integer)): - genome_row = self.genome_of_entry_nr(query) - if query <= 0 or query > genome_row['EntryOff'] + genome_row['TotEntries']: - raise InvalidOmaId(query) - else: - genome_row = self.genome_from_UniProtCode(query) - return (genome_row['EntryOff'] + 1, - genome_row['EntryOff'] + genome_row['TotEntries'],) - - def species_ordering(self, root=None): - """get ordering of the genomes with respect to taxonomy. - - This method returns a linear ordering of all the genomes with - respect to their lineage, i.e. 
genomes that are evolutionary - "close" to each other appear close in the ordering. - Optionally, one can give a root genome, that will be the species - the ordering is going to start with. - - :param root: UniProtSpeciesCode of the root genome. - :returns: a list of species codes in the correct order.""" - if root is None: - root = self.genome_table[0]['UniProtSpeciesCode'] - root_genome = self.genome_from_UniProtCode(root) - lins = {g['UniProtSpeciesCode']: [lev['Name'] for lev in self._db.tax.get_parent_taxa(g['NCBITaxonId'])][::-1] - for g in self.genome_table} - root_lin = lins[root_genome['UniProtSpeciesCode']] - sort_key = {} - for g, lin_g in lins.items(): - for k in range(min(len(root_lin), len(lin_g))): - if root_lin[k] != lin_g[k]: - k -= 1 - break - sort_key[g] = (-k, lin_g) - sorted_genomes = sorted(list(sort_key.keys()), key=lambda g: sort_key[g]) - return {g.decode(): v for v, g in enumerate(sorted_genomes)} - - -class AmbiguousID(Exception): - def __init__(self, message, candidates): - super(AmbiguousID, self).__init__(message, candidates) - self.candidates = candidates - - -class IDResolver(object): - def __init__(self, db): - entry_nr_col = db.get_hdf5_handle().root.Protein.Entries.cols.EntryNr - self.max_entry_nr = entry_nr_col[int(entry_nr_col.index[-1])] - self._db = db - - def _from_numeric(self, e_id): - nr = int(e_id) - if not 0 < nr <= self.max_entry_nr: - raise InvalidId('{0:d} out of protein range: {1:}'.format(nr, e_id)) - return nr - - def _from_omaid(self, e_id): - return int(self._db.id_mapper['OMA'].omaid_to_entry_nr(e_id)) - - def search_xrefs(self, e_id): - """search for all xrefs. TODO: what happens if xref is ambiguous?""" - res = set([x['EntryNr'] for x in self._db.id_mapper['XRef'].search_xref(e_id)]) - if len(res) == 0: - # let's try to mach as substring using suffix array case insensitive - res = set([x['EntryNr'] for x in self._db.id_mapper['XRef'].search_xref(e_id, match_any_substring=True)]) - if len(res) == 0: - raise InvalidId(e_id) - if len(res) > 1: - # check whether its only different isoforms, then return canonical isoform - splice_variants = set([x['AltSpliceVariant'] for x in (self._db.entry_by_entry_nr(eNr) for eNr in res)]) - logger.info('xref {} has {} entries, {} splice variants'.format(e_id, len(res), len(splice_variants))) - if len(splice_variants) > 1 or 0 in splice_variants: - raise AmbiguousID('Cross-ref "{}" is ambiguous'.format(e_id), res) - else: - res = splice_variants - return int(res.pop()) - - def resolve(self, e_id): - """maps an id to the entry_nr of the current OMA release.""" - try: - nr = self._from_numeric(e_id) - except ValueError: - try: - nr = self._from_omaid(e_id) - except (InvalidOmaId, UnknownSpecies) as e: - nr = self.search_xrefs(e_id) - return nr - - -class Taxonomy(object): - """Taxonomy provides an interface to navigate the taxonomy data. - - The input data is the same as what is stored in the Database in - table "/Taxonomy".""" - - def __init__(self, data, genomes=None, _valid_levels=None): - if not isinstance(data, numpy.ndarray): - raise ValueError('Taxonomy expects a numpy table.') - self.genomes = genomes if genomes is not None else {} - self.tax_table = data - self.taxid_key = self.tax_table.argsort(order=('NCBITaxonId')) - self.parent_key = self.tax_table.argsort(order=('ParentTaxonId')) - self.all_hog_levels = _valid_levels - if _valid_levels is None: - self._load_valid_taxlevels() - - def _load_valid_taxlevels(self): - forbidden_chars = re.compile(r'[^A-Za-z. 
-]') - try: - with open(os.environ['DARWIN_BROWSERDATA_PATH'] + '/TaxLevels.drw') as f: - taxStr = f.read() - tax_json = json.loads(("[" + taxStr[14:-3] + "]").replace("'", '"')) - self.all_hog_levels = frozenset([t.encode('ascii') for t in - tax_json if forbidden_chars.search(t) is None]) - except (IOError, KeyError): - self.all_hog_levels = frozenset([l for l in self.tax_table['Name'] - if forbidden_chars.search(l.decode()) is None]) - - def _table_idx_from_numeric(self, tid): - i = self.tax_table['NCBITaxonId'].searchsorted( - tid, sorter=self.taxid_key) - idx = self.taxid_key[i] - if self.tax_table[idx]['NCBITaxonId'] != tid: - raise InvalidTaxonId(u"{0:d} is an invalid/unknown taxonomy id".format(tid)) - return idx - - def _get_root_taxon(self): - i1 = self.tax_table['ParentTaxonId'].searchsorted(0, sorter=self.parent_key) - i2 = self.tax_table['ParentTaxonId'].searchsorted(0, sorter=self.parent_key, side='right') - if i2 - i1 == 0: - raise DBConsistencyError('Not a single root in Taxonomy: {}' - .format(self.tax_table[self.parent_key[i1]])) - elif i2 - i1 == 1: - res = self.tax_table[self.parent_key[i1]] - else: - res = numpy.array([(0, -1, b'LUCA')], dtype=self.tax_table.dtype)[0] - return res - - def _taxon_from_numeric(self, tid): - idx = self._table_idx_from_numeric(tid) - return self.tax_table[idx] - - def _direct_children_taxa(self, tid): - i = self.tax_table['ParentTaxonId'].searchsorted(tid, sorter=self.parent_key) - idx = [] - while i < len(self.parent_key) and self.tax_table[self.parent_key[i]]['ParentTaxonId'] == tid: - idx.append(self.parent_key[i]) - i += 1 - return self.tax_table.take(idx) - - def get_parent_taxa(self, query): - """Get array of taxonomy entries leading towards the - root of the taxonomy. - - :param query: the starting taxonomy level""" - idx = [] - parent = query - count = 0 - while parent != 0: - i = self._table_idx_from_numeric(parent) - idx.append(i) - tmp = self.tax_table[i]['ParentTaxonId'] - if tmp == parent: - raise InvalidTaxonId(u"{0:d} has itself as parent".format(tmp)) - parent = tmp - count += 1 - if count > 100: - raise InvalidTaxonId(u"{0:d} exceeds max depth of 100. Infinite recursion?".format(query)) - return self.tax_table.take(idx) - - def _get_taxids_from_any(self, it, skip_missing=True): - if not isinstance(it, numpy.ndarray): - try: - it = numpy.fromiter(it, dtype='i4') - except ValueError: - it = numpy.fromiter(it, dtype='S255') - if it.dtype.type is numpy.string_: - try: - ns = self.name_key - except AttributeError: - ns = self.name_key = self.tax_table.argsort(order='Name') - idxs = self.tax_table['Name'].searchsorted(it, sorter=ns) - idxs = numpy.clip(idxs, 0, len(ns) - 1) - taxs = self.tax_table[ns[idxs]] - keep = taxs['Name'] == it - if not skip_missing and not keep.all(): - raise KeyError('not all taxonomy names could be found') - res = taxs['NCBITaxonId'][keep] - else: - res = it - return res - - def get_induced_taxonomy(self, members, collapse=True, augment_parents=False): - """Extract the taxonomy induced by a given set of `members`. - - This method allows to extract the part which is induced by a - given set of levels and leaves that should be part of the - new taxonomy. `members` must be an iterable, the levels - must be either numeric taxids or scientific names. - - Unless `augment_parents` is set to true, the resulting sub-taxonomy - will only contain levels that are specified in `members`. 
If - `augment_parents` is set to True, also all parent nodes of the - levels passed in members are considered for the sub-taxonomy. - - :param iter members: an iterable containing the levels - and leaves that should remain in the new taxonomy. can be - either axonomic ids or scientific names. - - :param bool collapse: whether or not levels with only one child - should be skipped or not. This defaults to True - - :param bool augment_parents: whether or not to consider parent - levels of members for the resulting taxonomy.""" - - taxids_to_keep = numpy.sort(self._get_taxids_from_any(members)) - if augment_parents: - # find all the parents of all the members, add them to taxids_to_keep - additional_levels = set([]) - for cur_tax in taxids_to_keep: - try: - additional_levels.update(set(self.get_parent_taxa(cur_tax)['NCBITaxonId'])) - except KeyError: - logger.info("{} seems not to exist in Taxonomy".format(cur_tax)) - pass - # add and remove duplicates - all_levels = numpy.append(taxids_to_keep, list(additional_levels)) - taxids_to_keep = numpy.unique(all_levels) - - idxs = numpy.searchsorted(self.tax_table['NCBITaxonId'], taxids_to_keep, sorter=self.taxid_key) - idxs = numpy.clip(idxs, 0, len(self.taxid_key) - 1) - subtaxdata = self.tax_table[self.taxid_key[idxs]] - if not numpy.alltrue(subtaxdata['NCBITaxonId'] == taxids_to_keep): - raise KeyError('not all levels in members exists in this taxonomy') - - updated_parent = numpy.zeros(len(subtaxdata), 'bool') - for i, cur_tax in enumerate(taxids_to_keep): - if updated_parent[i]: - continue - # get all the parents and check which ones we keep in the new taxonomy. - parents = self.get_parent_taxa(cur_tax)['NCBITaxonId'] - mask = numpy.in1d(parents, taxids_to_keep) - # find the position of them in subtaxdata (note: subtaxdata and - # taxids_to_keep have the same ordering). - new_idx = taxids_to_keep.searchsorted(parents[mask]) - taxids = taxids_to_keep[new_idx] - # parent taxid are ncbitaxonids shifted by one position! - parents = numpy.roll(taxids, -1) - parents[-1] = 0 - subtaxdata['ParentTaxonId'][new_idx] = parents - updated_parent[new_idx] = True - - if collapse: - nr_children = collections.defaultdict(int) - for p in subtaxdata['ParentTaxonId']: - nr_children[p] += 1 - rem = [p for (p, cnt) in nr_children.items() if cnt == 1 and p != 0] - if len(rem) > 0: - idx = taxids_to_keep.searchsorted(rem) - return self.get_induced_taxonomy(numpy.delete(taxids_to_keep, idx)) - return Taxonomy(subtaxdata, genomes=self.genomes, _valid_levels=self.all_hog_levels) - - def newick(self): - """Get a Newick representation of the Taxonomy - - Note: as many newick parsers do not support quoted labels, - the method instead replaces spaces with underscores.""" - def newick_enc(s): - return s.translate({ord(' '): u'_', ord('('): u'[', ord(')'): u']'}) - - def _rec_newick(node): - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_newick(child)) - - if len(children) == 0: - return newick_enc(node['Name'].decode()) - else: - t = ",".join(children) - return '(' + t + ')' + newick_enc(node['Name'].decode()) - - return _rec_newick(self._get_root_taxon()) + ';' - - def as_dict(self): - """Encode the Taxonomy as a nested dict. 
- - This representation can for example be used to serialize - a Taxonomy in json format.""" - - def _rec_phylogeny(node): - res = {'name': node['Name'].decode(), 'id': int(node['NCBITaxonId'])} - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_phylogeny(child)) - if len(children) > 0: - res['children'] = children - else: - try: - g = self.genomes[res['id']] - res['code'] = g.uniprot_species_code - except KeyError: - pass - return res - - return _rec_phylogeny(self._get_root_taxon()) - - def as_phyloxml(self): - """Encode the Taxonomy as phyloxml output""" - - def _rec_phyloxml(node): - n = et.Element("clade") - tax = et.SubElement(n, "taxonomy") - id_ = et.SubElement(tax, "id", provider="uniprot") - id_.text = str(node['NCBITaxonId']) - - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_phyloxml(child)) - if len(children) == 0: - try: - g = self.genomes[int(node['NCBITaxonId'])] - code = et.SubElement(tax, 'code') - code.text = g.uniprot_species_code - except ValueError: - pass - sci = et.SubElement(tax, 'scientific_name') - sci.text = node['Name'].decode() - n.extend(children) - return n - - root = et.Element('phyloxml', xmlns="http://www.phyloxml.org") - phylo = et.SubElement(root, "phylogeny", rooted="true", rerootable="false") - name = et.SubElement(phylo, "name") - name.text = "(Partial) species phylogeny from OMA Browser" - phylo.append(_rec_phyloxml(self._get_root_taxon())) - - return et.tostring(root, encoding='utf-8') - - -class InvalidTaxonId(Exception): - pass - - -class DBVersionError(Exception): - pass - - -class DBConsistencyError(Exception): - pass - - -class InvalidId(Exception): - pass - - -class InvalidOmaId(InvalidId): - pass - - -class UnknownIdType(Exception): - pass - - -class UnknownSpecies(Exception): - pass - - -class Singleton(Exception): - def __init__(self, entry, msg=None): - super(Singleton, self).__init__(msg) - self.entry = entry - - -class NoReprEntry(Exception): - pass - - -class IdMapperFactory(object): - def __init__(self, db_obj): - self.db = db_obj - self.mappers = {} - - def __getitem__(self, idtype): - return self.get_mapper(idtype) - - def get_mapper(self, idtype): - try: - mapper = self.mappers[idtype] - except KeyError: - try: - mapper = globals()[str(idtype).title() + 'IdMapper'](self.db) - self.mappers[idtype] = mapper - except KeyError: - raise UnknownIdType('{} is unknown'.format(str(idtype))) - return mapper - - -class XrefIdMapper(object): - def __init__(self, db): - self._db = db - self.xref_tab = db.get_hdf5_handle().get_node('/XRef') - self.xrefEnum = self.xref_tab.get_enum('XRefSource') - self.idtype = frozenset(list(self.xrefEnum._values.keys())) - self.xref_index = SuffixSearcher(db.get_hdf5_handle().get_node('/XRef_Index')) - - def map_entry_nr(self, entry_nr): - """returns the XRef entries associated with the query protein. - - The types of XRefs that are returned depends on the idtype - class member variable. In the base-class, idtype contains - all valid xref types. Typically, subclasses of XrefIdMapper - will change this set. - - :param entry_nr: the numeric id of the query protein. 
- :returns: list of dicts with 'source' and 'xref' keys.""" - res = [{'source': self.xrefEnum._values[row['XRefSource']], - 'xref': row['XRefId'].decode()} - for row in self.xref_tab.where('EntryNr=={:d}'.format(entry_nr)) - if row['XRefSource'] in self.idtype] - return res - - def canonical_source_order(self): - """returns the list of xref sources in order of their importance. - - Most important source - in the base class for example UniProtKB/SwissProt - are first. The canonical order is defined in the enum definition. - - :returns: list of source strings""" - return [self.xrefEnum(z) for z in sorted(self.idtype)] - - def iter_xrefs_for_entry_nr(self, entry_nr): - """Iterate over the xrefs of a given entry number. - - This method returns a dict with 'source' and 'xref' fields - (both str) holding the information of the xref record. - - :param entry_nr: the numeric id of the query protein""" - for row in self.xref_tab.where('EntryNr=={:d}'.format(entry_nr)): - if row['XRefSource'] in self.idtype: - yield {'source': self.xrefEnum._values[row['XRefSource']], - 'xref': row['XRefId'].decode()} - - def _combine_query_values(self, field, values): - parts = ['({}=={})'.format(field, z) for z in values] - return '|'.join(parts) - - def map_many_entry_nrs(self, entry_nrs): - """map several entry_nrs with as few db queries as possible - to their cross-references. The function returns a - :class:`numpy.recarray` containing all fields as defined in - the table. - - :param entry_nrs: a list with numeric protein entry ids""" - mapped_junks = [] - junk_size = 32 - len(self.idtype) # respect max number of condition variables. - source_condition = self._combine_query_values('XRefSource', self.idtype) - for start in range(0, len(entry_nrs), junk_size): - condition = "({}) & ({})".format( - self._combine_query_values('EntryNr', - entry_nrs[start:start + junk_size]), - source_condition) - mapped_junks.append(self.xref_tab.read_where(condition)) - return numpy.lib.recfunctions.stack_arrays( - mapped_junks, - usemask=False) - - def search_xref(self, xref, is_prefix=False, match_any_substring=False): - """identify proteins associcated with `xref`. - - The crossreferences are limited to the types in the class - member `idtype`. In the base class, all types are valid - xrefs. The method returns a :class:`numpy.recarry` defined - for the XRef table with all entries pointing to `xref`. - - The method by default returns only exact matches. By setting - `is_prefix` to True, one can indicated that the requested xref - should be interpreted as a prefix and all entries matching this - prefix should be returned. - - :param str xref: an xref to be located - :param bool is_prefix: treat xref as a prefix and return - potentially several matching xrefs""" - if match_any_substring: - query = xref.encode('utf-8').lower() - res = self.xref_tab[self.xref_index.find(query)] - else: - if is_prefix: - up = xref[:-1] + chr(ord(xref[-1])+1) - cond = '(XRefId >= {!r}) & (XRefId < {!r})'.format( - xref.encode('utf-8'), up.encode('utf-8')) - else: - cond = 'XRefId=={!r}'.format(xref.encode('utf-8')) - res = self.xref_tab.read_where(cond) - if len(res) > 0 and len(self.idtype) < len(self.xrefEnum): - res = res[numpy.in1d(res['XRefSource'], list(self.idtype))] - return res - - def source_as_string(self, source): - """string representation of xref source enum value - - this auxiliary method converts the numeric value of - a xref source into a string representation. 
- - :param int source: numeric value of xref source""" - try: - return self.xrefEnum._values[source] - except KeyError: - raise ValueError("'{}' is not a valid xref source value".format(source)) - - def xreftab_to_dict(self, tab): - """convert a xreftable to a dictionary per entry_nr. - - All rows in `tab` are converted into a nested dictionary - where the outer key is a protein entry number and the - inner key the xref source type. - - :param tab: a :class:`numpy.recarray` corresponding to XRef - table definition to be converted""" - xrefdict = collections.defaultdict(dict) - for row in tab: - try: - typ = self.xrefEnum._values[row['XRefSource']] - except IndexError: - logger.warning('invalid XRefSource value in {}'.format(row)) - continue - if typ not in xrefdict[row['EntryNr']]: - xrefdict[row['EntryNr']][typ] = {'id': row['XRefId']} - return xrefdict - - -class UniProtIdMapper(XrefIdMapper): - def __init__(self, db): - super(UniProtIdMapper, self).__init__(db) - self.idtype = frozenset([self.xrefEnum[z] - for z in ['UniProtKB/SwissProt', 'UniProtKB/TrEMBL']]) - - -class LinkoutIdMapper(XrefIdMapper): - def __init__(self, db): - super(LinkoutIdMapper, self).__init__(db) - self.idtype = frozenset([self.xrefEnum[z] - for z in ['UniProtKB/SwissProt', 'UniProtKB/TrEMBL', - 'Ensembl Protein', 'Ensembl Gene', - 'EntrezGene']]) - - def url(self, typ, id_): - # TODO: improve url generator in external module with all xrefs - url = None - try: - id_ = id_.decode() - except AttributeError: - pass - - if typ.startswith('UniProtKB'): - url = 'http://uniprot.org/uniprot/{}'.format(id_) - elif typ == 'EntrezGene': - url = 'http://www.ncbi.nlm.nih.gov/gene/{}'.format(id_) - elif typ.startswith('Ensembl'): - url = 'http://ensembl.org/id/{}'.format(id_) - return url - - def xreftab_to_dict(self, tab): - xref = super(LinkoutIdMapper, self).xreftab_to_dict(tab) - for d in list(xref.values()): - for typ, elem in list(d.items()): - elem['url'] = self.url(typ, elem['id']) - return xref - - def iter_xrefs_for_entry_nr(self, entry_nr): - """same as base clase but includes also the url as a field""" - for xref in super(LinkoutIdMapper, self).iter_xrefs_for_entry_nr(entry_nr): - xref['url'] = self.url(xref['source'], xref['xref']) - yield xref - - -class DomainNameIdMapper(object): - def __init__(self, db): - self.domain_src = db.get_hdf5_handle().root.Annotations.DomainDescription.read() - self.domain_src.sort(order='DomainId') - - def _get_dominfo(self, domain_id): - idx = self.domain_src['DomainId'].searchsorted(domain_id) - if self.domain_src[idx]['DomainId'] != domain_id: - raise KeyError("no domain info available for {}".format(domain_id)) - return self.domain_src[idx] - - def get_info_dict_from_domainid(self, domain_id): - info = self._get_dominfo(domain_id) - return {'name': info['Description'].decode(), 'source': info['Source'].decode(), - 'domainid': domain_id.decode()} - - -class FastMapper(object): - """GO Function projection to sequences from OMA hdf5 file""" - - def __init__(self, db): - self.db = db - - def iter_projected_goannotations(self, records): - # gene ontology fast mapping, uses exact / approximate search. - # todo: implement taxonomic restriction. 
- # Input: iterable of biopython SeqRecords - - for rec in records: - logger.debug('projecting function to {}'.format(rec)) - r = self.db.seq_search.search(str(rec.seq)) - if r is not None: - logger.debug(str(r)) - if r[0] == 'exact': - tdfs1 = [] - for enum in r[1]: - df = self.db.get_gene_ontology_annotations(enum, as_dataframe=True) - if df is not None: - df['With'] = 'Exact:{}'.format(self.db.id_mapper['Oma'].map_entry_nr(enum)) - tdfs1.append(df) - go_df = pd.concat(tdfs1, ignore_index=True) - - else: - # Take best match. TODO: remove those below some level of match. - match_enum = r[1][0][0] - match_score = r[1][0][1]['score'] - logger.debug('match: enum: {}, score:{}'.format(match_enum, match_score)) - go_df = self.db.get_gene_ontology_annotations(match_enum, as_dataframe=True) - if go_df is not None: - go_df['With'] = 'Approx:{}:{}'.format(self.db.id_mapper['Oma'].map_entry_nr(match_enum), - match_score) - if go_df is not None: - go_df['DB'] = 'OMA_FastMap' - go_df['Assigned_By'] = go_df['DB'] - go_df['DB_Object_ID'] = rec.id - go_df['DB_Object_Symbol'] = go_df['DB_Object_ID'] - go_df['Evidence'] = 'IEA' - go_df['DB:Reference'] = 'OMA_Fun:002' - go_df['Taxon_ID'] = 'taxon:-1' - len_with_dupl = len(go_df) - go_df.drop_duplicates(inplace=True) - logger.debug('cleaning duplicates: from {} to {} annotations'.format(len_with_dupl, len(go_df))) - for row in go_df.to_dict('records'): - yield row - - def write_annotations(self, file, seqrecords): - """Project annotations and write them to file - - This method takes a filehandle and an iterable of BioPython - SeqRecords objects as input. The function computes the - projected annotations and writes them to the file in gaf - format. - - :param file: filehandle to write annotations to - :param seqrecords: input sequencs to project functions to - """ - - file.write('!gaf-version: {}\n'.format(GAF_VERSION)) - file.write('!Project Name: OMA Fast Function Projection\n') - file.write('!Date created: {}\n'.format(time.strftime("%c"))) - file.write('!Contact Email: contact@omabrowser.org\n') - for anno in self.iter_projected_goannotations(seqrecords): - GOA.writerec(anno, file, GOA.GAF20FIELDS) diff --git a/src/HogProf/build/lib/pyoma/browser/geneontology.py b/src/HogProf/build/lib/pyoma/browser/geneontology.py deleted file mode 100755 index 7782e97..0000000 --- a/src/HogProf/build/lib/pyoma/browser/geneontology.py +++ /dev/null @@ -1,424 +0,0 @@ -from builtins import int, bytes, str -import collections -import csv -import logging -import math -import re -import numpy - -""" -IMPORTANT NOTE: ---------------- -This module has been copied from the dessimoz zoo library -directly. If you want to add functionality to this module, -make sure it is also integrated into the zoo. At the moment -we don't want to depend with pyoma on the zoo library as it -has many dependencies that are difficult to maintain. - - -Gene ontology module defining classes and methods to parse -and navigate the gene ontology DAG as well as to parse GO -annotations. 
- - -:author: Adrian Altenhoff -:institute: ETH Zurich -""" - -NUM_ONT = 3 -NUM_GO_ID_DIGITS = 7 -UP_RELS = frozenset(['is_a', 'part_of']) -_REV_RELS = {'is_a': 'can_be', 'part_of': 'has_part'} - - -def reverse_name_of_rels(rels): - down = frozenset([_REV_RELS[z] for z in rels]) - return down - - -def validate_go_id(term): - if isinstance(term, (int, numpy.integer)): - return int(term) - - term = term.strip() - if term.startswith('GO:'): - digits = term[3:] - else: - digits = term - if not digits.isdigit() or len(digits) > NUM_GO_ID_DIGITS: - raise ValueError("GO ID {} is not a valid go term".format(term)) - return int(digits) - - -class GOAspect(object): - aspects = dict(molecular_function=0, biological_process=1, cellular_component=2) - aspect2char = {0: 'F', 1: 'P', 2: 'C'} - - @classmethod - def from_string(cls, aspect): - return cls.aspects[aspect] - - @classmethod - def to_string(cls, aspectnr): - for o, i in cls.aspects.items(): - if i == aspectnr: - return o - raise KeyError('aspect number not found: ' + str(aspectnr)) - - @classmethod - def to_char(cls, aspectnr): - # Converts an encoded aspect to the character required for GOA files - return cls.aspect2char[aspectnr] - - -class GOterm(object): - """A class representing a single Gene Ontology term. - - This class can serve as a factory for the OntologyParser. For that, - pass it as a factory on 'Term'. """ - - def __init__(self, stanza): - self.id = validate_go_id(stanza['id'][0]) - self.name = ' '.join(stanza['name']) - self.definition = ' '.join(stanza['def']) - self.aspect = GOAspect.from_string(' '.join(stanza['namespace'])) - self.is_a = [validate_go_id(parent) for parent in stanza['is_a']] - for rel in stanza['relationship']: - reltype, partner = rel.strip().split() - if not reltype in self.__dict__.keys(): - self.__dict__[reltype] = list() - self.__dict__[reltype].append(validate_go_id(partner)) - - def replace_parentnames_by_refs(self, ont): - for rel in [('is_a', 'can_be'), ('part_of', 'has_part')]: - if rel[0] in self.__dict__.keys(): - for i, parent_id in enumerate(self.__dict__[rel[0]]): - parent_obj = ont[parent_id] - self.__dict__[rel[0]][i] = parent_obj - parent_obj._add_relation(self, rel[1]) - - def _add_relation(self, term, rel): - if rel not in self.__dict__.keys(): - self.__dict__[rel] = list() - self.__dict__[rel].append(term) - - def get_parents(self, rels=None): - """iterate over the direct parent GO terms. - - by default "is_a" and "part_of" relations are followed. This can be overwritten - with the `rels`. - - :param rels: a set of relations to follow.""" - if rels is None: - rels = UP_RELS - for rel in rels: - try: - for term in getattr(self, rel): - yield term - except AttributeError: - pass - - def __str__(self): - fmt = "GO:{{0:0{}d}}".format(NUM_GO_ID_DIGITS) - return fmt.format(self.id) - - -class AbstractParser(object): - def __init__(self, fp): - self.close_fp = False - if isinstance(fp, str): - if fp.endswith('.gz'): - from gzip import GzipFile - fp = GzipFile(fp, 'rb') - else: - fp = open(fp, 'r') - self.close_fp = True - self.fp = fp - self._read_headers() - - def _read_headers(self): - pass - - def close_if_opened(self): - if self.close_fp: - self.fp.close() - - -class OntologyParser(AbstractParser): - """A general purpose Ontolgoy parser - - Any ontology in the OBO format can be parsed with this object. 
The - stanzas are converted to objects using the factories passed in the - initializer.""" - def __init__(self, fp, factories=None): - """creates an ontology parser - - :param fp: a filehandle or path to file (either plaintext or - gzipped) containing the ontology. - :param factories: a dictionary containing per stanza class - (e.g. [Term]) a factory that returns an object from the - data. The data is passed as dict to the factory""" - if not factories: - factories = dict(Term=GOterm) - super(OntologyParser, self).__init__(fp) - self.factories = factories - self.tag_value_pair_re = re.compile(r"\s*(?P[^:]+):\s*(?P[^!]*)") - self.stanza_name_re = re.compile(r"\[(?P[^]]*)\]") - - def stanzas(self): - """iterates over the stanzas in the ontology yielding - objects according to the factory parameter provided - in the constructor.""" - curStanza = None - for line in self.fp: - line = line.strip() - if not line or line[0] == '!': - continue - - # check whether a new stanza starts - match = self.stanza_name_re.match(line) - if match is not None: - obj = self._create_obj_from_stanza(curStanza) - if obj is not None: - yield obj - curStanza = collections.defaultdict(list) - curStanza['_name'] = match.group("name") - elif curStanza is not None: - # it has to be value-pair line. add it to the stanza - match = self.tag_value_pair_re.match(line) - curStanza[match.group('tag')].append(match.group('value')) - obj = self._create_obj_from_stanza(curStanza) - if obj is not None: - yield obj - self.close_if_opened() - - def _create_obj_from_stanza(self, stanza): - """method which creates the appropriate object from a given - stanza or returns None, e.g. for stanza types without a provided - factory or obsolete terms or ...""" - res = None - if stanza is not None: - try: - factory = self.factories[stanza['_name']] - except KeyError: - # no factory for this stanza type. ignore - pass - else: - if ((not "is_obsolete" in stanza) or - (not 'true' in stanza['is_obsolete'])): - res = factory(stanza) - return res - - def __iter__(self): - return self.stanzas() - - -class GeneOntology(object): - """The GeneOntology object contains the whole ontology in an internal - format and allows to traverse it efficiently.""" - def __init__(self, parser, rels=None): - if rels is None: - rels = UP_RELS - if not isinstance(parser, OntologyParser): - raise Exception('requires an OntologyParser instance') - self.parser = parser - self.up_rels = UP_RELS.intersection(rels) - self.down_rels = reverse_name_of_rels(self.up_rels) - - def parse(self): - """parse the ontology data. - - This method should be called after instantiation and before any traversal""" - self.terms = dict() - for cur_term in self.parser: - self.terms[cur_term.id] = cur_term - - # replace parents nrs by the references to the GO terms objects - # this can be only done once all the terms have been created - for term in self.terms.values(): - term.replace_parentnames_by_refs(self.terms) - - def ensure_term(self, term): - """returns the term object associated with term. if term is already - a GOterm object, it is simply return. term_ids can be either numeric - ids of existing GO-terms or propper GO-terms ids, i.e. GO:000002 - - :param term: a term_id or a GOterm object""" - if isinstance(term, GOterm): - return term - else: - return self.term_by_id(term) - - def term_by_id(self, term_id): - """Returns the term object associated with term_id. 
- - :param term_id: a GO-term number or a GO-term id (GO:008150).""" - try: - term = self.terms[validate_go_id(term_id)] - return term - except KeyError: - raise ValueError(str(term_id) + ' is an invalid GO term.') - - def get_superterms_incl_queryterm(self, term, max_steps=-1): - """returns a set with all the superterms of a query term. - - :param max_steps: The search can be limited to contain only - terms that are at most 'max_steps' upwards. If set to -1, no - limit is applied and the search goes up to the root.""" - term = self.ensure_term(term) - return self._traverseGraph(term, max_steps, self.up_rels) - - def get_subterms(self, term, max_steps=-1): - term = self.ensure_term(term) - return self._traverseGraph(term, max_steps, self.down_rels) - - def _traverseGraph(self, node, max_steps, rels): - """_traverseGraph traverses the graph in a breath first manner - and reports all the nodes reachable within max_steps.""" - remain = set([node]) - found = set() - while len(remain) > 0 and max_steps != 0: - novel = set() - for t in remain: - for rel in rels: - try: - novel.update(t.__dict__[rel]) - except KeyError: - pass - found.update(remain) - remain = novel.difference(found) - max_steps -= 1 - return found - - -class FreqAwareGeneOntology(GeneOntology): - """GO hierarchy represents the Gene Ontology vocabulary. - - It gets loaded from the xml file and, in conjunction with - an annotation file (GOA) the relative frequencies per term get - estimated. this estimation respects the hierarchy of the - vocabulary. - Further, this class provides methods to traverse the hierarchy - in an easy way.""" - - def __init__(self, fp, rels=UP_RELS): - super(FreqAwareGeneOntology, self).__init__(fp, rels=rels) - self.reset_freqs() - - def reset_freqs(self): - self.cnts = dict() - self.tot_cnts = [0] * NUM_ONT - - def estimate_freqs(self, annotations): - for anno in annotations: - try: - self._update_counts(self.term_by_id(anno['TermNr'])) - except ValueError: - logging.info("invalid annotation term_id in freq estim:" + - str(anno.term_id)) - - def _update_counts(self, term): - for cur_term in self.get_superterms_incl_queryterm(term): - self.cnts[cur_term.id] = self.cnts.get(cur_term.id, 0) + 1 - self.tot_cnts[cur_term.aspect] += 1 - - def get_term_frequency(self, term): - term = self.ensure_term(term) - try: - freq = self.cnts.get(term.id, 0) / self.tot_cnts[term.aspect] - return freq - except ZeroDivisionError: - return 0 - - def last_common_ancestor(self, *terms): - cand = self.get_superterms_incl_queryterm(terms[0]) - for t in terms[1:]: - cand.intersection_update(self.get_superterms_incl_queryterm(t)) - lca = min(cand, key=self.get_term_frequency) - return lca - - def lin_similarity(self, term1, term2): - term1 = self.ensure_term(term1) - term2 = self.ensure_term(term2) - if term1.aspect != term2.aspect: - # early abort, since the two terms will be by - # definition not similar - sim = 0 - else: - lca = self.last_common_ancestor(term1, term2) - sim = (2 * math.log(self.get_term_frequency(lca)) / - (math.log(self.get_term_frequency(term1)) + - math.log(self.get_term_frequency(term2)))) - return sim - - -class AnnotationFilter(object): - EXP_CODES = frozenset(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']) - TRUST_IEA_REFS = frozenset([ - 'GO_REF:0000002', 'GOA:interpro', 'GOA:interpro|GO_REF:0000002', # InterPro - 'GO_REF:0000003', 'GOA:spec', 'GOA:spec|GO_REF:0000003' # EC number - 'GO_REF:0000004', 'GOA:spkw', 'GOA:spkw|GO_REF:0000004', - 'GO_REF:0000037', 'GO_REF:0000038' # SwissProt Keywords - 
'GO_REF:0000023', 'GOA:spsl', 'GOA:spsl|GO_REF:0000023', - 'GO_REF:0000039', 'GO_REF:0000040', # UniProtKB Subcellular Location - ]) - - @staticmethod - def is_negated(a): - return a.qualifier.find('NOT') >= 0 - - @classmethod - def is_exp_annotation(cls, a): - return a.evidence in cls.EXP_CODES and not cls.is_negated(a) - - @classmethod - def is_trusted_electronic(cls, a): - return a.evidence == 'IEA' and a.db_ref in cls.TRUST_IEA_REFS and not cls.is_negated(a) - - @classmethod - def is_exp_or_trusted_electronic(cls, a): - return cls.is_exp_annotation(a) or cls.is_trusted_electronic(a) - - -# definition of the GOA Annotations. for performance reasons, we -# keep this as a namedtuple collection. -GOA_Annotation = collections.namedtuple('GOA_Annotation', - ['db', 'db_obj_id', 'db_obj_sym', 'qualifier', 'term_id', - 'db_ref', 'evidence', 'with_from', 'aspect', 'db_obj_name', - 'db_obj_syn', 'db_obj_typ', 'taxon', 'date', 'assigned_by', - 'ext', 'gene_product_from_id']) - - -class AnnotationParser(object): - def __init__(self, fp, factory=GOA_Annotation._make): - self._needs_close = False - if isinstance(fp, str): - if fp.endswith('.gz'): - from gzip import GzipFile - fp = GzipFile(fp, 'rb') - self._needs_close = True - else: - fp = open(fp, 'rb') - self.fp = fp - self.factory = factory - - self._read_headers() - - def _read_headers(self): - pass - - def annotations(self): - """Iterates over the annotations in the file yielding objects - constructed by the factory argument passed to the constructor - of this class for each annotation.""" - csv_reader = csv.reader((l for l in self.fp if not l.startswith('!')), - delimiter='\t') - for row in csv_reader: - yield self.factory(row) - if self._needs_close: - self.fp.close() - - def __iter__(self): - return self.annotations() - - diff --git a/src/HogProf/build/lib/pyoma/browser/homoeologs.py b/src/HogProf/build/lib/pyoma/browser/homoeologs.py deleted file mode 100755 index ce0177c..0000000 --- a/src/HogProf/build/lib/pyoma/browser/homoeologs.py +++ /dev/null @@ -1,238 +0,0 @@ -import pandas -import logging -import collections -import numpy as np -import matplotlib -matplotlib.use('agg') -from skfuzzy import control as ctrl -from skfuzzy import gaussmf -import sklearn -import sklearn.preprocessing -import tables - -try: - from tqdm import tqdm -except ImportError: - tqdm = lambda x, **kwargs: x -logger = logging.getLogger(__name__) - - -def define_universe(df): - # New Antecedent/Consequent objects hold universe variables and membership functions - distance = ctrl.Antecedent(np.arange(0, df['Distance'].max() + .01, .01), 'distance') - synteny = ctrl.Antecedent(np.arange(0, 1.01, .01), 'synteny_score') - total_nb_hom = ctrl.Antecedent(np.arange(2, df['TotalCopyNr'].max() + 1, 1), 'total_nb_homoeologs') - conf = ctrl.Consequent(np.arange(0, 101, 1), 'conf') - return distance, synteny, total_nb_hom, conf - - -def create_fuzzy_rules(distance, synteny, total_nb_hom, conf): - """Takes the antecedent and consequent objects as input""" - - # very low confidence - rule1 = ctrl.Rule(synteny['low'] & distance['high'] & total_nb_hom['high'], conf['very_low']) - - # low confidence - rule2 = ctrl.Rule((synteny['low'] & distance['high'] & total_nb_hom['low']) | - (synteny['low'] & distance['high'] & total_nb_hom['med']) | - (synteny['low'] & distance['med'] & total_nb_hom['high']) | - (synteny['med'] & distance['high'] & total_nb_hom['high']), - conf['low']) - - # medium confidence - rule3 = ctrl.Rule((synteny['high'] & distance['high'] & total_nb_hom['high']) | - 
- (synteny['low'] & distance['med'] & total_nb_hom['low']) | - (synteny['low'] & distance['med'] & total_nb_hom['med']) | - (synteny['med'] & distance['high'] & total_nb_hom['low']) | - (synteny['med'] & distance['high'] & total_nb_hom['med']) | - (synteny['med'] & distance['med'] & total_nb_hom['high']) | - (synteny['med'] & distance['med'] & total_nb_hom['low']) | - (synteny['med'] & distance['med'] & total_nb_hom['med']) | - (synteny['low'] & distance['low'] & total_nb_hom['low']) | - (synteny['low'] & distance['low'] & total_nb_hom['med']) | - (synteny['low'] & distance['low'] & total_nb_hom['high']), - conf['med']) - - # high confidence - rule4 = ctrl.Rule((synteny['high'] & distance['high'] & total_nb_hom['low']) | - (synteny['high'] & distance['high'] & total_nb_hom['med']) | - (synteny['high'] & distance['low'] & total_nb_hom['high']) | - (synteny['high'] & distance['low'] & total_nb_hom['med']) | - (synteny['high'] & distance['med'] & total_nb_hom['high']) | - (synteny['high'] & distance['med'] & total_nb_hom['low']) | - (synteny['high'] & distance['med'] & total_nb_hom['med']) | - (synteny['med'] & distance['low'] & total_nb_hom['high']) | - (synteny['med'] & distance['low'] & total_nb_hom['med']) | - (synteny['med'] & distance['low'] & total_nb_hom['low']), - conf['high']) - - # very high confidence - rule5 = ctrl.Rule(synteny['high'] & distance['low'] & total_nb_hom['low'], - conf['very_high']) - return [rule1, rule2, rule3, rule4, rule5] - - -def get_distance_mf(df, distance): - # here, the first numnber is the universe, second number the central point, third the standard deviation - distance['low'] = gaussmf(distance.universe, 0, (df['Distance'].max() / 10)) - - distance['med'] = gaussmf(distance.universe, - (df['Distance'].max() / 4), - (df['Distance'].max() / 10)) - - distance['high'] = gaussmf(distance.universe, - df['Distance'].max(), - (df['Distance'].max() / 2.5)) - return distance - - -def get_synteny_mf(df, synteny, view=False): - # synteny (gaussian) - synteny['low'] = gaussmf(synteny.universe, 0, .15) - synteny['med'] = gaussmf(synteny.universe, .3, .15) - synteny['high'] = gaussmf(synteny.universe, .7, .25) - return synteny - - -def get_total_nb_hom_mf(df, total_nb_hom): - copy_nr_median = df['TotalCopyNr'].median() - total_nb_hom['low'] = gaussmf(total_nb_hom.universe, - copy_nr_median, copy_nr_median) - - total_nb_hom['med'] = gaussmf(total_nb_hom.universe, - 4 * copy_nr_median, - 1.5 * copy_nr_median) - - total_nb_hom['high'] = gaussmf(total_nb_hom.universe, - df['TotalCopyNr'].max(), - df['TotalCopyNr'].max() / 2.5) - return total_nb_hom - - -def get_conf_mf(df, conf): - # confidence (gaussian) - conf['very_low'] = gaussmf(conf.universe, 0, 20) - conf['low'] = gaussmf(conf.universe, 50, 10) - conf['med'] = gaussmf(conf.universe, 70, 10) - conf['high'] = gaussmf(conf.universe, 90, 10) - conf['very_high'] = gaussmf(conf.universe, 100, 10) - return conf - - -def get_conf_score(simulation, input_dic): - """This function takes the simulation and outputs confidence score - 'input_dic' is a dictionary of the inputs for a homoeolog pair""" - - simulation.inputs(input_dic) - simulation.compute() - return simulation.output['conf'] - - -class HomeologsConfidenceCalculator(object): - def __init__(self, h5_handle, genome): - self.h5_handle = h5_handle - self.genome = genome - if isinstance(h5_handle, tables.File): - self.h5_handle = h5_handle - elif isinstance(h5_handle, (str, bytes)): - self.h5_handle = tables.open_file(h5_handle, 'r') - else: - raise TypeError("expected 
h5_handle to be either h5-file handle or a path to file") - - genome_row = next(self.h5_handle.root.Genome.where('UniProtSpeciesCode == genome')) - self.genome_range = (int(genome_row['EntryOff']) + 1, - int(genome_row['EntryOff'] + genome_row['TotEntries'])) - genome_df = pandas.DataFrame(self.h5_handle.root.Protein.Entries.read_where( - '(EntryNr >= {}) & (EntryNr <= {})'.format(*self.genome_range))) - self.genome_df = genome_df[ - (genome_df['AltSpliceVariant'] == 0) | (genome_df['AltSpliceVariant'] == genome_df['EntryNr'])] - self.genome_df.reset_index(inplace=True) - self.relations_df = self._load_pairwise_relations() - - def _load_pairwise_relations(self): - """load the homoeologous relations of the cannonical splice variants only - The method returns a pandas dataframe with the relations.""" - df = pandas.DataFrame( - self.h5_handle.get_node('/PairwiseRelation/{}/within'.format(self.genome)).read_where('RelType == 5')) - df = df[df['EntryNr1'].isin(self.genome_df['EntryNr']) & df['EntryNr2'].isin(self.genome_df['EntryNr'])] - return df[['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal', 'Distance']] - - def _count_homeologs_per_entry(self, df): - return collections.Counter(df['EntryNr1']) - - def _augment_dataframe_with_all_features(self, df): - counts = self._count_homeologs_per_entry(df) - df['TotalCopyNr'] = df.apply(lambda x: counts[x['EntryNr1']] + counts[x['EntryNr2']], axis=1) - df.loc[df.SyntenyConservationLocal < 0, 'SyntenyConservationLocal'] = 0 - return df - - def calculate_scores(self): - # load dataframe - df = self.relations_df - df = self._augment_dataframe_with_all_features(df) - - distanceObj, syntenyObj, total_nb_homObj, confObj = define_universe(df) - distance = get_distance_mf(df, distanceObj) - synteny = get_synteny_mf(df, syntenyObj) - total_nb_hom = get_total_nb_hom_mf(df, total_nb_homObj) - conf = get_conf_mf(df, confObj) - - # create simulation - rules = create_fuzzy_rules(distance, synteny, total_nb_hom, conf) - control_system = ctrl.ControlSystem(rules) - simulation = ctrl.ControlSystemSimulation(control_system) - - def defuzzify(row): - return get_conf_score(simulation, - {'distance': row['Distance'], - 'synteny_score': row['SyntenyConservationLocal'], - 'total_nb_homoeologs': row['TotalCopyNr']}) - - df['fuzzy_confidence'] = df.apply(defuzzify, axis=1) - - # scale the confidence between minimum value and 100 - min_max_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(df['fuzzy_confidence'].min(), 100)) - df['fuzzy_confidence_scaled'] = min_max_scaler.fit_transform(df['fuzzy_confidence'].values.reshape(-1, 1)) - return df - - -class HomeologsConfidenceCalculatorFromTSV(HomeologsConfidenceCalculator): - def __init__(self, infile): - self.relations_df = pandas.read_csv(infile, sep='\t') - expected_columns = ['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal', 'Distance'] - if len(set(expected_columns) - set(self.relations_df.columns.values)) > 0: - raise KeyError("provided inputfile does not have all expected columns. 
" - "Expected columns are {}".format(expected_columns)) - - -if __name__ == "__main__": - import argparse - # Get arguments from command line - parser = argparse.ArgumentParser( - description='Computes homoeology confidence score using fuzzy logic') - grp = parser.add_mutually_exclusive_group(required=True) - grp.add_argument('--h5', help="name of hdf5 file, full path") - grp.add_argument('--csv', help="tab-separated file with input data as alternative to hdf5 file") - parser.add_argument('--genome', - help="5 letter code of polyploid genome to analyze. " - "Must be specified if used with --h5 option.") - parser.add_argument('--outfile', - help="name where results will be stored (file name created to include parameters)", - default="homoeolog_confidence.tsv") - - args = parser.parse_args() - logging.basicConfig(level=logging.INFO) - - if args.h5 is not None and args.genome is None: - import sys - sys.stderr.write("genomes argument required if using with an hdf5 file as input") - sys.exit(1) - - if args.h5: - import tables - scorer = HomeologsConfidenceCalculator(tables.open_file(args.h5), args.genome) - else: - scorer = HomeologsConfidenceCalculatorFromTSV(args.csv) - data = scorer.calculate_scores() - data.to_csv(args.outfile, sep='\t', header=True, index=True) diff --git a/src/HogProf/build/lib/pyoma/browser/linkout.py b/src/HogProf/build/lib/pyoma/browser/linkout.py deleted file mode 100755 index 98fa0ee..0000000 --- a/src/HogProf/build/lib/pyoma/browser/linkout.py +++ /dev/null @@ -1,276 +0,0 @@ -import collections -import operator -import os -import logging -import math -import re -import ftplib - -from tqdm import tqdm -from lxml import etree -from .db import Database - -logger = logging.getLogger(__name__) - -"""Module to generate external data and crosslinks for NCBI. - -""" - - -class NCBILinkOutXML(object): - root_node = "not_set" - provider_id = "9822" - - def __init__(self): - root = etree.Element(self.root_node) - for key, value in self.root_children().items(): - root.append(self.text_elemement(key, value)) - self.tree = etree.ElementTree(root) - self._add_doctype() - - def root_children(self): - return {} - - def _add_doctype(self): - self.tree.docinfo.public_id = '-//NLM//DTD LinkOut 1.0//EN' - self.tree.docinfo.system_url = 'https://www.ncbi.nlm.nih.gov/projects/linkout/doc/LinkOut.dtd' - - def text_elemement(self, tag, text): - el = etree.Element(tag) - el.text = text - return el - - def write(self, fh): - fh.write(etree.tostring(self.tree, pretty_print=True, xml_declaration=True, encoding='utf-8')) - - -class Provider(NCBILinkOutXML): - root_node = "Provider" - - def root_children(self): - elements = collections.OrderedDict( - [("ProviderId", self.provider_id), - ("Name", "OMA Browser: Orthologous MAtrix"), - ("NameAbbr", "OMA"), - #("SubjectType", "taxonomy/phylogenetic"), - ("Url", "http://omabrowser.org"), - ("Brief", "OMA is a method and database for the inference of orthologs among complete genomes. 
" - "We provide browsable orthology predictions, APIs, flat file downloads among thousands " - "of genomes.")]) - return elements - - -class Resource(NCBILinkOutXML): - root_node = "LinkSet" - link_id = 1 - - def _add_objs(self, accs): - objsel = etree.Element("ObjectSelector") - objsel.append(self.text_elemement("Database", self.database())) - objlst = etree.Element("ObjectList") - objsel.append(objlst) - for acc in accs: - objlst.append(self.object_node(acc)) - return objsel - - def object_node(self, acc): - return self.text_elemement("ObjId", acc) - - def _add_url_section(self, acc): - el = etree.Element('ObjectUrl') - el.append(self.text_elemement('Base', self.base_url())) - nxt = rule = etree.Element("Rule") - for k, rule_part in enumerate(self.rule_url(acc)): - if isinstance(rule_part, str): - if k == 0: - nxt.text = rule_part - else: - nxt.tail = rule_part - elif rule_part.tag == etree.Entity: - nxt.append(rule_part) - nxt = rule_part - el.append(rule) - el.append(self.text_elemement('SubjectType', "taxonomy/phylogenetic")) - return el - - def add_link(self, accs): - lnk = etree.Element("Link") - lnk.append(self.text_elemement('LinkId', str(self.link_id))) - lnk.append(self.text_elemement('ProviderId', self.provider_id)) - lnk.append(self._add_objs(accs)) - lnk.append(self._add_url_section(accs)) - self.tree.getroot().append(lnk) - self._bump_link_id() - - @classmethod - def _bump_link_id(cls): - cls.link_id += 1 - - def database(self): - return "not set" - - def base_url(self): - return "https://omabrowser.org/oma/hogs/" - - def rule_url(self, acc): - return "", - - -class GenesResource(Resource): - DISKSIZE_HEADER = 200 - DISKSIZE_PER_LINK = 435 - base_name = 'resource_genes' - - def base_url(self): - return "https://omabrowser.org/cgi-bin/gateway.pl/" - - def rule_url(self, acc): - return "?f=DisplayEntry&p1=" + next(iter(acc.values())), - - def database(self): - return "Gene" - - -class ProteinResource(Resource): - DISKSIZE_HEADER = 500 - DISKSIZE_PER_LINK = 45 - base_name = 'resource_protein' - - def base_url(self): - return "https://omabrowser.org/oma/hogs/" - - def object_node(self, acc): - return self.text_elemement("Query", "{}[accn]".format(acc)) - - def rule_url(self, acc): - return etree.Entity("lo.pacc"), "/vis/" - - def database(self): - return "Protein" - - -class TaxonomyResource(Resource): - DISKSIZE_HEADER = 200 - DISKSIZE_PER_LINK = 435 - base_name = 'resource_taxonomy' - - def database(self): - return "taxonomy" - - def base_url(self): - return "https://omabrowser.org/cgi-bin/gateway.pl/" - - def rule_url(self, acc): - return "?f=DisplayOS&p1=" + next(iter(acc.values())), - - -class LinkoutBuffer(object): - def __init__(self, resource, outdir='/tmp', bulk_add=True, max_file_size=20*2**20): - self.max_records = math.floor((max_file_size - resource.DISKSIZE_HEADER) / - resource.DISKSIZE_PER_LINK) - self.cur_nr = 0 - self.buf = [] - self.bulk_add = bulk_add - self.resource_type = resource - self.outdir = outdir - logger.info('Setup Linkout buffer for {} with max {} records ({}bytes) per file, bulk_add={}' - .format(resource.__name__, self.max_records, max_file_size, bulk_add)) - - def add(self, obj): - self.buf.append(obj) - if len(self.buf) >= self.max_records: - self.flush() - - def flush(self): - res = self.resource_type() - if self.bulk_add: - res.add_link(self.buf) - else: - for obj in self.buf: - res.add_link(obj) - fn = os.path.join(self.outdir, - '{}_{:02d}.xml'.format(res.base_name, self.cur_nr)) - with open(fn, 'wb') as fh: - res.write(fh) - self.cur_nr += 
1 - self.buf = [] - - -class GenesPriorizationHandler(object): - """Adapter to LinkoutBuffer to select only a limited number of crossrefs. - NCBI linkout caps at 10%""" - - def __init__(self, max_linkouts=None, db=None, **kwargs): - self.max_links = int(max_linkouts) if max_linkouts else 20357436//10 # obtained in Jan2018 - logger.info('Limiting Genes to {} links max'.format(self.max_links)) - self.genes_buffer = LinkoutBuffer(GenesResource, **kwargs) - self.genes = [] - self.db = db - - def add(self, key, value): - self.genes.append((key, value)) - - def _genome_size_map(self): - gs = self.db.get_hdf5_handle().get_node('/Genome').read() - return {row['UniProtSpeciesCode'].decode(): row['TotEntries'] for row in gs} - - def flush(self): - priority_prefixes = ['HUMAN', 'MOUSE', 'RATNO', 'PIGXX', 'DRO', 'SCH', 'YEAST', 'ARA', - 'WHEAT', 'PLAF', 'ECO', 'BAC', 'PANTR', 'ORY', 'GOSHI', 'BRA', - 'DANRE', 'CAE', 'MYC', 'STR', 'MAIZE', 'GORGO', 'PANTR', 'PONAB', - 'MACMU', 'YARLI', 'PEDHC', 'TRICA', 'XENTR', 'YERPE', 'POPTR'] - pat = re.compile(r"^({})".format('|'.join(priority_prefixes))) - if len(self.genes) > self.max_links: - # final sort order will be 'priority genome', genome size and proteins within genome - self.genes.sort(key=operator.itemgetter(1)) - if self.db is not None: - genome_size = self._genome_size_map() - self.genes.sort(key=lambda x: genome_size[x[1][0:5]], reverse=True) - self.genes.sort(key=lambda x: pat.match(x[1]) is None) - for link_acc, link_target in self.genes[0:self.max_links]: - self.genes_buffer.add({link_acc: link_target}) - self.genes_buffer.flush() - - c = collections.defaultdict(int) - for acc, target in self.genes[self.max_links:]: - c[target[0:5]] += 1 - logger.info('Skipping genes link in the following species: {}'.format(c)) - - -def prepare_linkout_files(outdir='/tmp', infile='../pyomabrowser/OmaServer.h5'): - prov = Provider() - with open(os.path.join(outdir, 'provider.xml'), 'wb') as fh: - prov.write(fh) - - db = Database(infile) - xrefs = db.get_hdf5_handle().get_node('/XRef') - xref_source_enum = xrefs.get_enum('XRefSource') - - protein_buffer = LinkoutBuffer(ProteinResource, outdir=outdir, bulk_add=True) - genes_buffer = GenesPriorizationHandler(db=db, outdir=outdir, bulk_add=False) - for xref in tqdm(xrefs): - if xref['XRefSource'] == xref_source_enum['RefSeq']: - protein_buffer.add(xref['XRefId'].decode()) - elif xref['XRefSource'] == xref_source_enum['EntrezGene']: - genes_buffer.add(xref['XRefId'].decode(), - db.id_mapper['OMA'].map_entry_nr(xref['EntryNr'])) - protein_buffer.flush() - genes_buffer.flush() - - with open(os.path.join(outdir, 'resource_taxonomy.xml'), 'wb') as fh: - taxs = TaxonomyResource() - for row in db.id_mapper['OMA'].genome_table: - taxs.add_link({str(row['NCBITaxonId']): row['UniProtSpeciesCode'].decode()}) - taxs.write(fh) - - -def copy_to_ncbi(dir, password, host='ftp-private.ncbi.nlm.nih.gov', user='omabrow'): - with ftplib.FTP(host, user, password) as session: - session.cwd('/holdings') - - for fname in os.listdir(dir): - if fname.endswith('.xml'): - with open(os.path.join(dir, fname), 'rb') as fh: - cmd = "STOR {}".format(fname) - session.storbinary(cmd, fp=fh) - logger.info('finished transfering '+fname) \ No newline at end of file diff --git a/src/HogProf/build/lib/pyoma/browser/locus_parser.py b/src/HogProf/build/lib/pyoma/browser/locus_parser.py deleted file mode 100755 index 51597fd..0000000 --- a/src/HogProf/build/lib/pyoma/browser/locus_parser.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy -import collections 
-from lark import Lark, Transformer, ParseError -from tables import dtype_from_descr -from .tablefmt import LocusTable -import logging - - -logger = logging.getLogger(__name__) - -"""This package is intended to parse the darwin locus structure -and create a numpy recarray out of it. """ - - -Exon = collections.namedtuple('Exon', ['start', 'end', 'strand']) - -grammar = '''?locus : join | complement | complement_join | location - join : "join" "(" (complement | location ) ("," (complement | location ))+ ")" - complement : "complement" "(" location ")" - complement_join : "complement" "(" "join" "(" location ("," location)+ ")" ")" - location : pos [".." pos ] | "FromElsewhere" "('" _SEQID "'," pos [".." pos] ")" - ?pos : num | "Before" "(" num ")" | "After" "(" num ")" - ?num : NUMBER -> number - _SEQID: /[A-Za-z0-9._-]+/ - - %import common.NUMBER - %import common.WS - %ignore WS''' - - -class LocusTransformer(Transformer): - def number(self, vals): - return int(vals[0]) - - def location(self, value): - return Exon(value[0], value[1] if len(value) > 1 else value[0], 1) - - def complement(self, value): - rev = [e._replace(strand=-1*e.strand) for e in value] - if len(rev) == 1: - return rev[0] - else: - return rev - - def complement_join(self, value): - return self.complement(value) - - def join(self, values): - return values - - -class LocusParser(object): - def __init__(self): - self.parser = Lark(grammar, start='locus') - self.locus_transformer = LocusTransformer() - self.dtype = dtype_from_descr(LocusTable) - - def parse(self, locus_string, entry_nr=0): - try: - tree = self.parser.parse(locus_string) - except ParseError as e: - raise ValueError("cannot parse '{}' locus string".format(locus_string)) - data = self.locus_transformer.transform(tree) - nr_exons = 1 if isinstance(data, Exon) else len(data) - locus_data = numpy.empty(nr_exons, dtype=self.dtype) - locus_data[['Start', 'End', 'Strand']] = data - locus_data['EntryNr'] = entry_nr - return locus_data diff --git a/src/HogProf/build/lib/pyoma/browser/models.py b/src/HogProf/build/lib/pyoma/browser/models.py deleted file mode 100755 index 9ad654e..0000000 --- a/src/HogProf/build/lib/pyoma/browser/models.py +++ /dev/null @@ -1,429 +0,0 @@ -from __future__ import division - -import collections - -import time - - -def format_sciname(sci, short=False): - p = set([sci.find(x) for x in ['(', 'serogroup', 'serotype', 'serovar', - 'biotype', 'subsp', 'pv.', 'bv.']]) - if sci.startswith('Escherichia coli'): - p.add(sci.find('O')) - p.discard(-1) - p = min(p) if len(p) > 0 else len(sci) - return {'species': sci[0:p], 'strain': sci[p:]} - - -class LazyProperty(object): - """Decorator to evaluate a property only on access. - - Compute the attribute value and caches it in the instance. - Python Cookbook (Denis Otkidach) http://stackoverflow.com/users/168352/denis-otkidach - This decorator allows you to create a property which can be computed once and - accessed many times.""" - - def __init__(self, method, name=None): - # record the unbound-method and the name - self.method = method - self.name = name or method.__name__ - self.__doc__ = method.__doc__ - - def __get__(self, inst, cls): - if inst is None: - return self - # compute, cache and return the instance's attribute value - result = self.method(inst) - # setattr redefines the instance's attribute so this doesn't get called again - setattr(inst, self.name, result) - return result - - -class KeyWrapper(object): - ''' - Enables the use of functions, e.g. bisect, with a key function. 
- ''' - def __init__(self, it, key): - self.it = it - self.key = key - - def __getitem__(self, i): - return self.key(self.it[i]) - - def __len__(self): - return len(self.it) - - -class Singleton(type): - """A meta-class to enforce a Singleton, e.g. a class that can be - instantiated only exactly once. - - Modified from Python Cookbook, 3rd Edition, p 357ff. - - :Example: - - class Foo(metaclass=Singleton): - def __init__(self): - pass #This part is executed only once - """ - def __init__(self, *args, **kwargs): - self.__instance = None - super(Singleton, self).__init__(*args, **kwargs) - - def __call__(self, *args, **kwargs): - if self.__instance is None: - self.__instance = super(Singleton, self).__call__(*args, **kwargs) - return self.__instance - - -class ProteinEntry(object): - """Model for a protein object - - This class provides an easy to use interface for a given protein - form the database. - - If instantiated with an entry_nr only, no data is loaded until a - property or method is accessed. Properties that need to access - additional data or loaded lazily and are cached in the object - (but not kept after deletion of object).""" - def __init__(self, db, e): - self._stored_entry = e - self._db = db - - @LazyProperty - def _entry(self): - return (self._db.entry_by_entry_nr(self._stored_entry) - if isinstance(self._stored_entry, int) - else self._stored_entry) - - @classmethod - def from_entry_nr(cls, db, eNr): - # e = db.entry_by_entry_nr(eNr) - return cls(db, int(eNr)) - - @property - def entry_nr(self): - return int(self._entry['EntryNr']) - - @property - def locus_start(self): - return int(self._entry['LocusStart']) - - @property - def locus_end(self): - return int(self._entry['LocusEnd']) - - @property - def strand(self): - return int(self._entry['LocusStrand']) - - @LazyProperty - def exons(self): - return ExonStructure.from_entry_nr(self._db, self.entry_nr) - - @property - def oma_group(self): - return int(self._entry['OmaGroup']) - - @property - def oma_hog(self): - return self._entry['OmaHOG'].decode() - - @property - def chromosome(self): - return self._entry['Chromosome'].decode() - - @property - def canonicalid(self): - return self._entry['CanonicalId'].decode() - - @property - def sequence_md5(self): - return self._entry['MD5ProteinHash'].decode() - - @LazyProperty - def genome(self): - g = self._db.id_mapper['OMA'].genome_of_entry_nr(self._entry['EntryNr']) - return Genome(self._db, g) - - @LazyProperty - def omaid(self): - return self._db.id_mapper['OMA'].map_entry_nr(self._entry['EntryNr']) - - @LazyProperty - def cdna(self): - return self._db.get_cdna(self._entry).decode() - - @property - def gc_content(self): - cdna = self.cdna - cnts = list(map(cdna.count, 'GCAT')) - try: - return sum(cnts[0:2])/sum(cnts) - except ZeroDivisionError: - return 0 - - @LazyProperty - def sequence(self): - return self._db.get_sequence(self._entry).decode() - - @property - def sequence_length(self): - return int(self._entry['SeqBufferLength']) - 1 - - @LazyProperty - def description(self): - return self._db.get_description(self._entry).decode() - - @property - def subgenome(self): - return self._entry['SubGenome'].decode() - - @LazyProperty - def hog_family_nr(self): - from .db import Singleton as HOGSingleton - try: - fam = self._db.hog_family(self._entry) - except HOGSingleton: - fam = 0 - return fam - - @property - def is_main_isoform(self): - return (self._entry['AltSpliceVariant'] == 0 or - self._entry['AltSpliceVariant'] == self._entry['EntryNr']) - - @LazyProperty - def 
alternative_isoforms(self): - return [ProteinEntry(self._db, e) - for e in self._db.get_splicing_variants(self._entry) - if e['EntryNr'] != self.entry_nr] - - def __repr__(self): - return "<{}({}, {})>".format(self.__class__.__name__, self.entry_nr, self.omaid) - - def __len__(self): - return self.sequence_length - - -class Genome(object): - def __init__(self, db, g): - self._genome = g - self._db = db - - @property - def ncbi_taxon_id(self): - return int(self._genome['NCBITaxonId']) - - @property - def uniprot_species_code(self): - return self._genome['UniProtSpeciesCode'].decode() - - @property - def sciname(self): - return self._genome['SciName'].decode() - - @property - def common_name(self): - try: - return self._genome['CommonName'].decode() - except ValueError: - return "" - - @property - def synonym_name(self): - return self._genome['SynName'].decode() - - @LazyProperty - def species_and_strain_as_dict(self): - return format_sciname(self.sciname) - - def species(self): - return self.species_and_strain_as_dict['species'] - - def strain(self): - return self.species_and_strain_as_dict['strain'] - - @property - def url(self): - return self._genome['Url'].decode() - - @property - def source(self): - return self._genome['Source'].decode() - - @property - def release(self): - return self._genome['Release'].decode() - - @property - def last_modfied_timestamp(self): - return self._genome['Date'] - - @property - def last_modified(self): - return self.modification_date("%Y-%b-%d") - - def modification_date(self, fmt): - if self._db.db_schema_version >= (3, 2): - return time.strftime(fmt, time.localtime(self.last_modfied_timestamp)) - else: - return 'n/a' - - @property - def nr_entries(self): - return int(self._genome['TotEntries']) - - @property - def entry_nr_offset(self): - return int(self._genome['EntryOff']) - - @LazyProperty - def kingdom(self): - # TODO: store directly in db - return self._db.tax.get_parent_taxa(self._genome['NCBITaxonId'])[-1]['Name'].decode() - - @property - def is_polyploid(self): - return self._genome['IsPolyploid'] - - @LazyProperty - def lineage(self): - return [lev['Name'].decode() for lev in self._db.tax.get_parent_taxa( - self._genome['NCBITaxonId'])] - - @LazyProperty - def chromosomes(self): - chrs = collections.defaultdict(list) - entry_tab = self._db.get_hdf5_handle().get_node('/Protein/Entries') - for row in entry_tab.where('(EntryNr > {}) & (EntryNr <= {})' - .format(self.entry_nr_offset, self.entry_nr_offset+self.nr_entries)): - chrs[row['Chromosome'].decode()].append(row['EntryNr']) - return chrs - - def __repr__(self): - return "<{}({}, {})>".format(self.__class__.__name__, self.uniprot_species_code, - self.ncbi_taxon_id) - - def __len__(self): - return self.nr_entries - - -class PairwiseRelation(object): - def __init__(self, db, relation): - self._relation = relation - self._db = db - - @property - def distance(self): - return float(self._relation['Distance']) - - @property - def score(self): - return float(self._relation['Score']) - - @property - def alignment_overlap(self): - return float(self._relation['AlignmentOverlap']) - - @property - def synteny_conservation_local(self): - return float(self._relation['SyntenyConservationLocal']) - - @property - def confidence(self): - return float(self._relation['Confidence']) - - @LazyProperty - def rel_type(self): - if not isinstance(self._relation['RelType'], str): - type_map = self._db._get_pw_tab(self._relation['EntryNr1'], 'VPairs').get_enum("RelType") - return type_map(self._relation['RelType']) - else: 
- return self._relation['RelType'] - - @LazyProperty - def entry_1(self): - return ProteinEntry(self._db, self._db.entry_by_entry_nr(self._relation['EntryNr1'])) - - @LazyProperty - def entry_2(self): - return ProteinEntry(self._db, self._db.entry_by_entry_nr(self._relation['EntryNr2'])) - - -class GeneOntologyAnnotation(object): - def __init__(self, db, anno): - self.db = db - self.anno = anno - - @LazyProperty - def term(self): - return self.db.gene_ontology.term_by_id(self.anno['TermNr']) - - @property - def evidence(self): - return self.anno['Evidence'].decode() - - @property - def reference(self): - return self.anno['Reference'].decode() - - @property - def entry_nr(self): - return int(self.anno['EntryNr']) - - @LazyProperty - def aspect(self): - from .geneontology import GOAspect - return GOAspect.to_string(self.term.aspect) - - -class ExonStructure(object): - def __init__(self, db, exons): - self._stored = exons - self._db = db - - @LazyProperty - def _exons(self): - return (self._db.get_exons(self._stored) - if isinstance(self._stored, int) - else self._stored) - - @classmethod - def from_entry_nr(cls, db, eNr): - return cls(db, int(eNr)) - - def _iter_exons(self): - if self._exons['Strand'][0] < 0: - self._exons[::-1].sort(order='Start') - else: - self._exons.sort(order='Start') - for exon in self._exons: - yield Exon(exon) - - def __len__(self): - return len(self._exons) - - def __repr__(self): - return "<{}(entry_nr={}, nr_exons={})>"\ - .format(self.__class__.__name__, - self._exons[0]['EntryNr'], len(self)) - - def __str__(self): - exs = list(str(e) for e in self._iter_exons()) - if len(exs) > 1: - return "join({})".format(", ".join(exs)) - else: - return exs[0] - - -class Exon(object): - def __init__(self, exon): - self.exon = exon - - def __str__(self): - if self.exon['Strand'] < 0: - template = "complement({}..{})" - else: - template = "{}..{}" - return template.format(self.exon['Start'], self.exon['End']) diff --git a/src/HogProf/build/lib/pyoma/browser/synteny.py b/src/HogProf/build/lib/pyoma/browser/synteny.py deleted file mode 100755 index 0abbcfe..0000000 --- a/src/HogProf/build/lib/pyoma/browser/synteny.py +++ /dev/null @@ -1,102 +0,0 @@ -import pandas -import tables -import logging -try: - from tqdm import tqdm -except ImportError: - tqdm = lambda x, **kwargs: x -logger = logging.getLogger(__name__) - - -class SyntenyScorer(object): - def __init__(self, h5_handle, genome, windowsize=10): - self.h5_handle = h5_handle - self.genome = genome - self.windowsize = windowsize - if isinstance(h5_handle, tables.File): - self.h5_handle = h5_handle - elif isinstance(h5_handle, (str, bytes)): - self.h5_handle = tables.open_file(h5_handle, 'r') - else: - raise TypeError("expected h5_handle to be either h5-file handle or a path to file") - - genome_row = next(self.h5_handle.root.Genome.where('UniProtSpeciesCode == genome')) - self.genome_range = (int(genome_row['EntryOff']) + 1, - int(genome_row['EntryOff'] + genome_row['TotEntries'])) - genome_df = pandas.DataFrame(self.h5_handle.root.Protein.Entries.read_where( - '(EntryNr >= {}) & (EntryNr <= {})'.format(*self.genome_range))) - self.genome_df = genome_df[(genome_df['AltSpliceVariant'] == 0) | (genome_df['AltSpliceVariant'] == genome_df['EntryNr'])] - self.genome_df.reset_index(inplace=True) - self.relations_df = self._load_pairwise_relations() - - def _load_pairwise_relations(self): - df = pandas.DataFrame(self.h5_handle.get_node('/PairwiseRelation/{}/within'.format(self.genome)).read_where('RelType == 5')) - return 
df[['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal']] - - def get_neighbor_genes(self, query): - q = self.genome_df[self.genome_df['EntryNr'] == query] - if len(q) == 0: - logger.error("querying neighbor genes for non-primary variant (EntryNr: {})".format(query)) - return [] - query_chr = q['Chromosome'] - neighbor = self.genome_df[max(0, q.index.item() - self.windowsize // 2): q.index.item() + self.windowsize//2 + 1] - return neighbor[neighbor['Chromosome'] == query_chr.item()] - - def score_of_pair(self, entry1, entry2): - neigh1 = self.get_neighbor_genes(entry1) - neigh2 = self.get_neighbor_genes(entry2) - if len(neigh1) <= 1 or len(neigh2) <= 1: - raise TooSmallChromosome("too few genes on chromosome: {}, {}".format(len(neigh1), len(neigh2))) - - rels_among_windows = self.relations_df[ - (self.relations_df['EntryNr1'] >= neigh1.iloc[0]['EntryNr']) & - (self.relations_df['EntryNr1'] <= neigh1.iloc[-1]['EntryNr']) & - (self.relations_df['EntryNr2'] >= neigh2.iloc[0]['EntryNr']) & - (self.relations_df['EntryNr2'] <= neigh2.iloc[-1]['EntryNr'])] - - score1 = (len(set(rels_among_windows['EntryNr1'])) - 1) / (len(neigh1) - 1) - score2 = (len(set(rels_among_windows['EntryNr2'])) - 1) / (len(neigh2) - 1) - res = {'entry_nr1': int(entry1), 'entry_nr2': int(entry2), - 'chr1': neigh1.iloc[0]['Chromosome'].decode(), - 'chr2': neigh2.iloc[0]['Chromosome'].decode(), - 'nr_genes_window1': len(neigh1), 'nr_genes_window2': len(neigh2), - 'synteny_score_1': score1, 'synteny_score_2': score2, - 'mean_synteny_score': (score1 + score2) / 2} - return res - - def compute_scores(self): - res = [] - for idx, rel in tqdm(self.relations_df.iterrows(), total=len(self.relations_df)): - try: - res.append(self.score_of_pair(rel['EntryNr1'], rel['EntryNr2'])) - except TooSmallChromosome as e: - logging.info("Skipping {}/{}: {}".format(int(rel['EntryNr1']), int(rel['EntryNr2']), e)) - pass - return pandas.DataFrame(res) - - -class TooSmallChromosome(Exception): - pass - - -#### MAIN #### -if __name__ == "__main__": - import argparse - # Get arguments from command line - parser = argparse.ArgumentParser( - description='Returns windows and their proportion of homoeolog VPs for a given chromosome') - parser.add_argument('--h5', help='name of h5 file, full path', required=True) - parser.add_argument('--window_genes', help='window size in genes', default=10) - parser.add_argument('--genome', help='5 letter code of polyploid genome to analyze') - parser.add_argument('--outfile', help='name where results will be stored (file name created to include parameters)', \ - default="synteny_results.tsv") - - args = parser.parse_args() - h5file_path = args.h5 - logging.basicConfig(level=logging.INFO) - - scorer = SyntenyScorer(tables.open_file(h5file_path), args.genome) - data = scorer.compute_scores() - columns = ['entry_nr1', 'chr1', 'nr_genes_window1', 'entry_nr2', 'chr2', 'nr_genes_window2', 'synteny_score_1', - 'synteny_score_2', 'mean_synteny_score'] - data[columns].to_csv(args.outfile, sep='\t', header=True, index=True) diff --git a/src/HogProf/build/lib/pyoma/browser/tablefmt.py b/src/HogProf/build/lib/pyoma/browser/tablefmt.py deleted file mode 100755 index eea8c7f..0000000 --- a/src/HogProf/build/lib/pyoma/browser/tablefmt.py +++ /dev/null @@ -1,162 +0,0 @@ -import tables - -"""This module contains the definitions of the database tables -used in the browser database. Some of these tables are used -multiple times, e.g. the PairwiseRelationTable is used -for each genome pair. 
- -From these table definitions one can easily extract the numpy -dtype that can hold the data: - - >>>tables.dtype_from_descr(HOGsTable) - dtype([('Fam', ' 0 and keys != oldkeys ) : - first = False - oldkeys = keys - leaves = set([leaf.name for leaf in tree.get_leaves()]) - orphans = set(genome_ids_list) - leaves - print(len(orphans)) - for orphan in orphans: - if str(orphan_info[orphan][-1]) in newdict: - newdict[str(orphan_info[orphan][-1])].append(orphan) - else: - newdict[str(orphan_info[orphan][-1])] = [orphan] - keys = set(list(newdict.keys())) - for n in tree.traverse(): - if n.name in newdict and n.name not in leaves: - for orph in newdict[n.name]: - n.add_sister(name=orph) - del newdict[n.name] - - for orphan in orphans: - if len(orphan_info[orphan]) > 1: - orphan_info[orphan].pop() - - newdict = {} - nodes = {} - print(orphans) - #clean up duplicates - for n in tree.traverse(): - if n.name not in nodes: - nodes[ n.name] =1 - else: - nodes[ n.name] +=1 - - for n in tree.traverse(): - if nodes[ n.name] >1: - if n.is_leaf()== False: - n.delete() - nodes[ n.name]-= 1 - - - return tree diff --git a/src/HogProf/build/lib/utils/goatools_utils.py b/src/HogProf/build/lib/utils/goatools_utils.py deleted file mode 100755 index d3c7dee..0000000 --- a/src/HogProf/build/lib/utils/goatools_utils.py +++ /dev/null @@ -1,186 +0,0 @@ - -from goatools import semantic -from goatools.obo_parser import GODag - -import json -from utils import hashutils -from utils import config_utils -import pickle -from goatools.go_enrichment import GOEnrichmentStudy - -##############enrichment############################################## - -def return_enrichment_study_obj(gaf_taxfiltered): - ''' - Generate go enrichment study object with a background dataset. - ''' - - obodag = GODag(config_utils.datadir+"/GOData/go-basic.obo") - goeaobj = GOEnrichmentStudy( - gaf_taxfiltered.keys(), # - gaf_taxfiltered, # geneid/GO associations possible with tree used for DB - obodag, # Ontologies - propagate_counts = False, - alpha = 0.15, # default significance cut-off - methods = ['fdr_bh']) # defult multipletest correction method - return goeaobj - -def buildGAF(gaf_file , universe= None): - gaf_filtered = {} - with open(gaf_file, mode='r') as gafin: - for line in gafin: - words = line.split() - if words[0] not in gaf_filtered: - gaf_filtered[words[0]]=set([words[1]]) - else: - gaf_filtered[words[0]].add(words[1]) - - if universe: - gaf_filtered = { prot:gaf_filtered[prot] for prot in universe} - - - return gaf_filtered - -def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): - ''' - Perform enrichment analysis on returned results - grabs all member protein of all hogs in result - returns goe results and HOG composition - ''' - #print(db_obj.member_of_hog_id(int(results[0]))) - hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - HOGS={} - print('compiling hogs') - prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: - print(i) - HOGS[result]=[] - for member in db_obj.iter_members_of_hog_id(result): - HOGS[result].append(member.omaid) - prots.append(member.omaid) - print('done') - print('running GO enrichment study') - - - goea_results_all = goeaobj.run_study(prots ) - print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) - print('DONE!') - return goea_results_all, HOGS - - 
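As a rough illustration of how the deleted enrichment helpers above were chained together, the sketch below strings buildGAF, return_enrichment_study_obj and run_GOEA_onresults end to end. It is a hypothetical usage sketch only: the GAF and HDF5 paths, the output prefix and the HOG family ids are placeholders, and the import paths assume the removed modules were importable as pyoma.browser.db and utils.goatools_utils.

# Hypothetical usage sketch; paths, family ids and import locations are assumptions.
from pyoma.browser.db import Database
from utils.goatools_utils import buildGAF, return_enrichment_study_obj, run_GOEA_onresults

db_obj = Database('OmaServer.h5')            # open OMA browser HDF5 file (placeholder path)
gaf = buildGAF('oma_filtered.gaf')           # dict: protein id -> set of associated GO terms
goeaobj = return_enrichment_study_obj(gaf)   # GODag + GOEnrichmentStudy built on that background
fam_ids = ['0415840', '0552021']             # example zero-padded HOG family id strings
goea_results, hog_members = run_GOEA_onresults(fam_ids, db_obj, goeaobj, outname='example_')
# goea_results holds the goatools enrichment records, also written under config_utils.datadir
# as 'example_enrichment.txt'; hog_members maps each 'HOG:...' id to its member OMA ids,
# pickled alongside as 'example_Hogs2Prots.pkl'.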
-def run_GOEA_onresults_tar(results, tar, goeaobj, outname = None): - ''' - Perform enrichment analysis on returned results - grabs all member protein of all hogs in result - returns goe results and HOG composition - ''' - ## TODO: finish this function with tar hog to list of prot IDS - #print(db_obj.member_of_hog_id(int(results[0]))) - #hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - - - HOGS={} - print('compiling hogs') - prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: - print(i) - HOGS[result]=[] - for member in db_obj.iter_members_of_hog_id(result): - HOGS[result].append(member.omaid) - prots.append(member.omaid) - print('done') - print('running GO enrichment study') - - goea_results_all = goeaobj.run_study(prots ) - print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) - print('DONE!') - return goea_results_all, HOGS - - -######################resnik semsim ################################################### - -def resnik_sim_hdf5(go_id1, go_id2, godag, termcounts, hdf5): - ''' - Computes Resnik's similarity measure. - ''' - try: - msca_goid = deepest_common_ancestor_hdf5([goterm2id(go_id1), goterm2id(go_id2)], godag, hdf5) - score = semantic.get_info_content(msca_goid, termcounts) - except: - score = -1 - return score - - -def deepest_common_ancestor_hdf5(go_ids, godag, hdf5): - ''' - Gets the nearest common ancestor - using the above function. - Only returns single most specific - assumes unique exists. - ''' - # Take the element at maximum depth. - return max(common_parent_go_ids_hdf5(go_ids, hdf5), key=lambda t: godag[t].depth) - -def common_parent_go_ids_hdf5(go_ids, hdf5_set): - ''' - Finds the common ancestors in the GO - tree of the list of goids in the input. - ''' - candidates = set(hdf5_set[go_ids[0]].tolist()) - for go_id in go_ids[1:]: - candidates_to_add = set(hdf5_set[go_id].tolist()) - candidates.intersection_update(candidates_to_add) - corrected_candidates = [id2goterm(c) for c in candidates] - return corrected_candidates - -def resnik_sim_pandas(tup, df , termcounts): - ''' - Computes Resnik's similarity measure. 
- ''' - go_id1, go_id2 = tup - #print(df.head()) - if go_id1 == go_id2: - return semantic.get_info_content(go_id1, termcounts) - - elif go_id2 in df.index and go_id1 in df.index: - - ancestors = df.loc[str(go_id2)].parents - ancestors += df.loc[str(go_id1)].parents - terms = df.loc[ancestors] - ancestors_set = terms.parents.tolist() - intersection = set(ancestors_set[0]).intersection(* ancestors_set[1:]) - common_ancestors = df.loc[list(intersection)] - common_ancestors = common_ancestors.sort_values('depth', ascending= False) - msca_goid = common_ancestors.index.tolist()[0] - return semantic.get_info_content(msca_goid, termcounts) - - else: - return -1 - - -def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf , genomes = None): - ''' - iterate over hog members and get the go information from a gaf in memory - ''' - fam = hashutils.hogid2fam(hog_id) - go_terms = { mr.omaid:gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf } - return go_terms - - -def goterm2id(go_term_to_modif): - - return int(go_term_to_modif.split(':')[1]) - -def id2goterm(go_term_to_modif): - return 'GO:{:07d}'.format(go_term_to_modif) diff --git a/src/HogProf/build/lib/utils/hashutils.py b/src/HogProf/build/lib/utils/hashutils.py deleted file mode 100755 index 6f293b5..0000000 --- a/src/HogProf/build/lib/utils/hashutils.py +++ /dev/null @@ -1,166 +0,0 @@ - - -import datasketch -import itertools -import ete3 -import copy -import math -import numpy as np -import pandas as pd - - -def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ): - #weighing function for tax level, masking levels etc. sets all weights to 1 - """ - Generate the weights of each taxonomic level to be applied during the - constructin of weighted minhashes - :param mastertree: full corrected ncbi taxonomy - :param taxaIndex: dict mapping taxa to columns - :param taxfilter: list of branches to delete - :param taxmask: if this is not NONE taxmask, the DB is constructed with this subtree - :return: weights: a vector of weights for each tax level - """ - - weights = { type: np.zeros((len(taxaIndex),1)) for type in ['presence', 'loss', 'dup']} - print(len(taxaIndex)) - newtree = mastertree - for event in weights: - for n in newtree.traverse(): - if taxmask: - if str(n.name) == str(taxmask): - newtree = n - break - if taxfilter: - if n.name in taxfilter: - n.delete() - for event in weights: - for n in newtree.traverse(): - weights[event][taxaIndex[n.name]] = 1 - return weights - -def hash_tree(tp , taxaIndex , treeweights , wmg): - """ - Generate a weighted minhash and binary matrix row for a tree profile - - :param tp: a pyham tree profile - :param taxaIndex: dict mapping taxa to columns - :param treeweights: a vector of weights for each tax levels - :param wmg: Datasketch weighted minhash generator - :return hog_matrix: a vector of weights for each tax level - :return weighted_hash: a weighted minhash of a HOG - - """ - - losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ] - dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ] - presence = [ taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in taxaIndex ] - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_weighted = np.zeros((1, 3*len(taxaIndex))) - hog_matrix_binary = np.zeros((1, 3*len(taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = 
np.asarray(indices[event])+i*len(taxaIndex) - hog_matrix_weighted[:,hogindex] = treeweights[hogindex].ravel() - hog_matrix_binary[:,hogindex] = 1 - weighted_hash = wmg.minhash(list(hog_matrix_weighted.flatten())) - - return hog_matrix_binary , weighted_hash - -def tree2str_DCA(tp , taxaIndex ): - """ - Generate a string where each column is a tax level - each letter code corresponds to an event type - each row is a protein family. for use with DCA pipelines - - :param tp: a pyham tree profile - :param taxaIndex: dict mapping taxa to columns - :return dcaMat: a weighted minhash of a HOG - """ - #convert a tree profile to a weighted minhash - - - losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ] - dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ] - presence = [ taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in taxaIndex ] - - - Ds = list( set(dupl).intersection(set(presence))) - Ps= list(set(presence).difference(set(dupl))) - Ls= list(set(losses)) - charar = np.chararray(len(taxaIndex) ) - #set to absent - - charar.fill( 'A') - charar[Ds] = 'D' - charar[Ls] = 'L' - charar[Ps] = 'P' - - return charar - -def row2hash(row , taxaIndex , treeweights , wmg): - """ - turn a dataframe row with an orthoxml file to hash and matrix row - :param row: lsh builder dataframe row - :param taxaIndex: dict mapping taxa to columnsfam - :param treeweights: a vector of weights for each tax levels - :param wmg: Datasketch weighted minhash generator - :return: hog_matrix: a vector of weights for each tax level - :return: weighted_hash: a weighted minhash of a HOG - """ - #convert a dataframe row to a weighted minhash - fam, treemap = row.tolist() - hog_matrix,weighted_hash = hash_tree(treemap , taxaIndex , treeweights , wmg) - return [weighted_hash,hog_matrix] - - -def fam2hash_hdf5(fam, hdf5, dataset = None, nsamples = 128 ): - #read the stored hash values and return a weighted minhash - """ - Read the stored hash values and return a weighted minhash - :param fam: hog id - :param hdf5: h5py object of the hashvalues - :param dataset: which dataset to use when constructing the hash - :return: minhash1: the weighted hash of your HOG - """ - if dataset is None: - dataset = list(hdf5.keys())[0] - hashvalues = np.asarray(hdf5[dataset][fam, :].reshape(nsamples,2 )) - hashvalues = hashvalues.astype('int64') - minhash1 = datasketch.WeightedMinHash( seed = 1, hashvalues=hashvalues) - return minhash1 - -def hogid2fam(hog_id): - """ - For use with OMA HOGs - Get fam given hog id - :param hog_id: hog id - :return: fam - - """ - - if not hog_id: - return hog_id - if type(hog_id) is int: - return hog_id - - if ':' in hog_id: - hog_id = hog_id.split(':')[1] - if '.' 
in hog_id: - hog_id = hog_id.split('.')[0] - hog_id = hog_id.replace("'",'') - fam = int(hog_id) - else: - fam = int(hog_id) - return fam - - -def fam2hogid(fam_id): - """ - For use with OMA HOGs - Get hog id given fam - :param fam_id: fam - :return: hog id - """ - hog_id = "HOG:" + (7-len(str(fam_id))) * '0' + str(fam_id) - return hog_id diff --git a/src/HogProf/build/lib/utils/preprocess_config.py b/src/HogProf/build/lib/utils/preprocess_config.py deleted file mode 100755 index 38427c1..0000000 --- a/src/HogProf/build/lib/utils/preprocess_config.py +++ /dev/null @@ -1,17 +0,0 @@ - -#turn the goDAG into a set of dictionaries -preprocessGO = True - -string_interactors = '/scratch/cluster/monthly/dmoi/stringdata/protein.links.detailed.v10.5.txt' - -preprocessSTRINGDB = False -uniprotmappings = '/scratch/cluster/monthly/dmoi/uniprotmapping/idmapping.dat' -startseq = 'Q7VBF3' - -preprocessUNIPROT = False -#empty redis before storing string info -clearRedis= False -#use GO information in OMA -#use mapping info from uniprot - -verbose = True diff --git a/src/HogProf/build/lib/utils/pyhamutils.py b/src/HogProf/build/lib/utils/pyhamutils.py deleted file mode 100755 index d5c4135..0000000 --- a/src/HogProf/build/lib/utils/pyhamutils.py +++ /dev/null @@ -1,86 +0,0 @@ -import pyham -import xml.etree.cElementTree as ET -import pickle -from utils import config_utils - -def get_orthoxml_oma(fam, db_obj): - orthoxml = db_obj.get_orthoxml(fam).decode() - return orthoxml - -def get_orthoxml_tar(fam, tar): - f = tar.extractfile(fam) - if f is not None: - return f.read() - else: - raise Exception( member + ' : not found in tarfile ') - return orthoxml - - -def get_species_from_orthoxml(orthoxml): - NCBI_taxid2name = {} - root = ET.fromstring(orthoxml) - for child in root: - if 'species' in child.tag: - NCBI_taxid2name[child.attrib['NCBITaxId']] = child.attrib['name'] - return NCBI_taxid2name - -def switch_name_ncbi_id(orthoxml , mapdict = None): - #swap ncbi taxid for species name to avoid ambiguity - #mapdict should be a mapping from species name to taxid if the info isnt in the orthoxmls - root = ET.fromstring(orthoxml) - for child in root: - if 'species' in child.tag: - child.attrib['name'] = child.attrib['NCBITaxId'] - elif mapdict: - child.attrib['name'] = mapdict[child.attrib['name']] - orthoxml = ET.tostring(root, encoding='unicode', method='xml') - return orthoxml - - - -def get_ham_treemap_from_row(row, tree , level = None): - fam, orthoxml = row - orthoxml = switch_name_ncbi_id(orthoxml) - try: - if level is None: - ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=True) - tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) - return tp.treemap - else: - ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=True) - #return subHOGs at level - slice = ham_obj.get_ancestral_genome_by_name(level) - treeprofiles = [ ham_obj.create_tree_profile(hog=h) for h in ham_obj.get_list_top_level_hogs()[0].get_at_level(slice) ] - - except TypeError as err: - print('Type error:', err) - return None - except AttributeError as err: - print('Attribute error:', err) - return None - - -def yield_families(h5file, start_fam): - """ - Given a h5file containing OMA server, returns an iterator over the families - (not sure if still in use) - :param h5file: omafile - :param start_fam: fam to start on - :return: fam number - """ - for row in h5file.root.OrthoXML.Index: - if row[0] > start_fam: - 
yield row[0] - - -def get_one_family(i, h5file): - ''' - get one family from database - Args: - i : family number - h5file : OMA server file - Return : - family - Not sure if still in use - ''' - return h5file.root.OrthoXML.Index[i][0] diff --git a/src/HogProf/lshbuilder.py b/src/HogProf/lshbuilder.py index 803cb55..10efcb2 100755 --- a/src/HogProf/lshbuilder.py +++ b/src/HogProf/lshbuilder.py @@ -1,48 +1,55 @@ -from tables import * -import functools import argparse -import sys +import functools +import gc +import logging import multiprocessing as mp -import glob -import pandas as pd -import time as t +import os import pickle -import xml.etree.cElementTree as ET - -from datasketch import MinHashLSHForest , WeightedMinHashGenerator +import random +import time +import time as t from datetime import datetime + +import ete3 import h5py -import time -import gc -from pyoma.browser import db -from HogProf.utils import pyhamutils, hashutils , files_utils import numpy as np +import pandas as pd import tqdm -import random -import tqdm -import os -import ete3 +from datasketch import MinHashLSHForest, WeightedMinHashGenerator +from pyoma.browser import db +from tables import * + +from HogProf.utils import pyhamutils, hashutils, files_utils + random.seed(0) np.random.seed(0) -class LSHBuilder: +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)x [%(levelname)s] %(message)s', + handlers=[ + logging.FileHandler('debug.log'), + logging.StreamHandler()]) + +class LSHBuilder: """ - This class contains the stuff you need to make - a phylogenetic profiling + This class contains the stuff you need to make + a phylogenetic profiling database with input orthxml files and a taxonomic tree - You must either input an OMA hdf5 file or an ensembl tarfile + You must either input an OMA hdf5 file or an ensembl tarfile containing orthoxml file with orthologous groups. - You can provide a species tree or use the ncbi taxonomy + You can provide a species tree or use the ncbi taxonomy with a list of taxonomic codes for all the species in your db """ - def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving_name=None , numperm = 256, treeweights= None , taxfilter = None, taxmask= None , lossonly = False, duplonly = False, verbose = False , use_taxcodes = False , datetime = datetime.now()): - + def __init__(self, h5_oma=None, fileglob=None, taxa=None, masterTree=None, saving_name=None, numperm=256, + treeweights=None, taxfilter=None, taxmask=None, lossonly=False, duplonly=False, verbose=False, + use_taxcodes=False, datetime=datetime.now()): + """ Initializes the LSHBuilder class with the specified parameters and sets up the necessary objects. 
- + Args: - tarfile_ortho (str): path to an ensembl tarfile containing orthoxml files - h5_oma (str): path to an OMA hdf5 file @@ -56,6 +63,10 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving - verbose (bool): whether to print verbose output (default: False) """ + logging.info('Initialising %s' % self.__class__.__name__) + self.groups = None + self.errorfile = None + if h5_oma: self.h5OMA = h5_oma self.db_obj = db.Database(h5_oma) @@ -64,7 +75,7 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.h5OMA = None self.db_obj = None self.oma_id_obj = None - + self.tax_filter = taxfilter self.tax_mask = taxmask self.verbose = verbose @@ -72,222 +83,237 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.fileglob = fileglob self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now()) - if saving_name: - self.saving_name= saving_name - if self.saving_name[-1]!= '/': - self.saving_name = self.saving_name+'/' + self.saving_name = saving_name + if self.saving_name[-1] != '/': + self.saving_name = self.saving_name + '/' self.saving_path = saving_name if not os.path.isdir(self.saving_path): os.mkdir(path=self.saving_path) else: - raise Exception( 'please specify an output location' ) - + raise Exception('please specify an output location') + if masterTree is None: if h5_oma: genomes = pd.DataFrame(h5_oma.root.Genome.read())["NCBITaxonId"].tolist() - genomes = [ str(g) for g in genomes] - taxa = genomes + [ 131567, 2759, 2157, 45596 ]+[ taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:]) ] + [ taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:]) ] - self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxa, genomes = genomes , outdir=self.saving_path ) - elif taxa: - with open(taxa, 'r') as taxin: - taxlist = [ int(line) for line in taxin ] - self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxlist , outdir=self.saving_path) + genomes = [str(g) for g in genomes] + taxa = genomes + [131567, 2759, 2157, 45596] + [taxrel[0] for taxrel in + list(h5_oma.root.Taxonomy[:])] + [taxrel[1] for taxrel + in list( + h5_oma.root.Taxonomy[:])] + self.tree_string, self.tree_ete3 = files_utils.get_tree(taxa=taxa, genomes=genomes, + outdir=self.saving_path) else: - raise Exception( 'please specify either a list of taxa or a tree' ) + raise Exception('please specify either a list of taxa or a tree') self.swap2taxcode = True - elif mastertree: + elif masterTree: self.tree_ete3 = ete3.Tree(masterTree, format=1) with open(masterTree) as treein: self.tree_string = treein.read() self.swap2taxcode = use_taxcodes - self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3 , self.tax_filter, self.tax_mask) - with open( self.saving_path + 'taxaIndex.pkl', 'wb') as taxout: - taxout.write( pickle.dumps(self.taxaIndex)) + self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3, self.tax_filter, self.tax_mask) + + with open(self.saving_path + 'taxaIndex.pkl', 'wb') as taxout: + taxout.write(pickle.dumps(self.taxaIndex)) + self.numperm = numperm + if treeweights is None: - #generate aconfig_utilsll ones - self.treeweights = hashutils.generate_treeweights(self.tree_ete3 , self.taxaIndex , taxfilter, taxmask) + # generate aconfig_utilsll ones + self.treeweights = hashutils.generate_treeweights(self.tree_ete3, self.taxaIndex, taxfilter, taxmask) else: - #load machine learning weights + # load machine learning weights self.treeweights = treeweights - 
print(self.treeweights) - wmg = WeightedMinHashGenerator(3*len(self.taxaIndex), sample_size = numperm , seed=1) - with open( self.saving_path + 'wmg.pkl', 'wb') as wmgout: - wmgout.write( pickle.dumps(wmg)) + + wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex), sample_size=numperm, seed=1) + + with open(self.saving_path + 'wmg.pkl', 'wb') as wmgout: + wmgout.write(pickle.dumps(wmg)) self.wmg = wmg - print( 'configuring pyham functions') + + logging.info('Configuring pyham functions') + if self.h5OMA: - self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode ) + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string, + swap_ids=self.swap2taxcode) else: - self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , orthoXML_as_string = False ) - self.HASH_PIPELINE = functools.partial( hashutils.row2hash , taxaIndex=self.taxaIndex, treeweights=self.treeweights, wmg=wmg , lossonly = lossonly, duplonly = duplonly) - if self.h5OMA: + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string, + swap_ids=self.swap2taxcode, orthoXML_as_string=False) + self.HASH_PIPELINE = functools.partial(hashutils.row2hash, taxaIndex=self.taxaIndex, + treeweights=self.treeweights, wmg=wmg, lossonly=lossonly, + duplonly=duplonly) + + if self.h5OMA: self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma, db_obj=self.db_obj) - + if self.h5OMA: - self.n_groups = len(self.h5OMA.root.OrthoXML.Index) + self.n_groups = len(self.h5OMA.root.OrthoXML.Index) elif self.fileglob: self.n_groups = len(self.fileglob) else: - raise Exception( 'please specify an input file' ) - + raise Exception('please specify an input file') + self.hashes_path = self.saving_path + 'hashes.h5' self.lshpath = self.saving_path + 'newlsh.pkl' self.lshforestpath = self.saving_path + 'newlshforest.pkl' - self.mat_path = self.saving_path+ 'hogmat.h5' + self.mat_path = self.saving_path + 'hogmat.h5' self.columns = len(self.taxaIndex) self.verbose = verbose - print('done') + print('Initialised') def load_one(self, fam): - #test function to try out the pipeline on one orthoxml + # test function to try out the pipeline on one orthoxml ortho_fam = self.READ_ORTHO(fam) pyham_tree = self.HAM_PIPELINE([fam, ortho_fam]) - hog_matrix,weighted_hash = hashutils.hash_tree(pyham_tree , self.taxaIndex , self.treeweights , self.wmg) - return ortho_fam , pyham_tree, weighted_hash,hog_matrix + hog_matrix, weighted_hash = hashutils.hash_tree(pyham_tree, self.taxaIndex, self.treeweights, self.wmg) + return ortho_fam, pyham_tree, weighted_hash, hog_matrix - def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None ): + def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None): families = {} start = -1 if self.h5OMA: - self.groups = self.h5OMA.root.OrthoXML.Index + self.groups = self.h5OMA.root.OrthoXML.Index self.rows = len(self.groups) + for i, row in tqdm.tqdm(enumerate(self.groups)): if i > start: fam = row[0] ortho_fam = self.READ_ORTHO(fam) hog_size = ortho_fam.count(' size: pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index yield pd_dataframe families = {} + pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index + yield pd_dataframe - print('last dataframe sent') - families = {} + + logging.info('Last 
dataframe sent') elif self.fileglob: - for i,file in enumerate(tqdm.tqdm(self.fileglob)): + for i, file in enumerate(tqdm.tqdm(self.fileglob)): + with open(file) as ortho: - #oxml = ET.parse(ortho) - #ortho_fam = ET.tostring( next(oxml.iter()), encoding='utf8', method='xml' ).decode() orthostr = ortho.read() + hog_size = orthostr.count(' size: pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index yield pd_dataframe families = {} - if i%10000 == 0: + + if i % 10000 == 0: print(i) - #save the mapping of fam to orthoxml + # save the mapping of fam to orthoxml pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index pd_dataframe.to_csv(self.saving_path + 'fam2orthoxml.csv') - - def universe_saver(self, i, q, retq, matq,univerq, l): - #only useful to save all prots within a taxonomic range as db is being compiled - allowed = set( [ n.name for n in self.tree_ete3.get_leaves() ] ) - with open(self.saving_path+'universe.txt') as universeout: - while True: - prots = univerq.get() - for row in df.iterrows(): - for ID in row.prots.tolist(): - universeout.write(ID) - else: - print('Universe saver done' + str(i)) - break - def worker(self, i, q, retq, matq, l): - if self.verbose == True: - print('worker init ' + str(i)) + if self.verbose: + logging.info('Initialising worker %s ' % str(i)) while True: df = q.get() - if df is not None : + if df is not None: df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1) - df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) + df[['hash', 'rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) retq.put(df[['Fam', 'hash']]) - #matq.put(df[['Fam', 'rows']]) else: - if self.verbose == True: - print('Worker done' + str(i)) + if self.verbose: + print('Worker done %s' % str(i)) break - def saver(self, i, q, retq, matq, l ): - print_start = t.time() + def saver(self, i, q, retq, matq, l): save_start = t.time() global_time = t.time() chunk_size = 100 count = 0 forest = MinHashLSHForest(num_perm=self.numperm) taxstr = '' + if self.tax_filter is None: taxstr = 'NoFilter' + if self.tax_mask is None: - taxstr+= 'NoMask' + taxstr += 'NoMask' else: taxstr = str(self.tax_filter) + self.errorfile = self.saving_path + 'errors.txt' with open(self.errorfile, 'w') as hashes_error_files: with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes: datasets = {} + if taxstr not in h5hashes.keys(): - if self.verbose == True: - print('creating dataset') - print('filtered at taxonomic level: '+taxstr) + if self.verbose: + logging.info('Creating dataset') + logging.info('Filtered at taxonomic level: ' + taxstr) h5hashes.create_dataset(taxstr, (chunk_size, 0), maxshape=(None, None), dtype='int32') - if self.verbose == True: - print(datasets) + + if self.verbose: + logging.info(datasets) h5flush = h5hashes.flush - print('saver init ' + str(i)) + + logging.info('Initialising saver %s ' % str(i)) + while True: this_dataframe = retq.get() if this_dataframe is not None: if not this_dataframe.empty: hashes = this_dataframe['hash'].to_dict() - #print(str(this_dataframe.Fam.max())+ 'fam num') - #print(str(count) + ' done') - hashes = {fam:hashes[fam] for fam in hashes if hashes[fam] } - [ forest.add(str(fam),hashes[fam]) for fam in hashes] + hashes = {fam: hashes[fam] for fam in hashes if hashes[fam]} + [forest.add(str(fam), hashes[fam]) for fam in hashes] + for fam in hashes: if len(h5hashes[taxstr]) < fam + 10: h5hashes[taxstr].resize((fam + 
chunk_size, len(hashes[fam].hashvalues.ravel()))) h5hashes[taxstr][fam, :] = hashes[fam].hashvalues.ravel() count += 1 + if t.time() - save_start > 200: - print( t.time() - global_time ) + logging.info(t.time() - global_time) forest.index() - print(forest.query( hashes[fam] , k = 10 ) ) + logging.info(forest.query(hashes[fam], k=10)) h5flush() save_start = t.time() - with open(self.lshforestpath , 'wb') as forestout: + + with open(self.lshforestpath, 'wb') as forestout: forestout.write(pickle.dumps(forest, -1)) - if self.verbose == True: - print('save done at' + str(t.time() - global_time)) + + if self.verbose: + logging.info('Saved to %s' % str(t.time() - global_time)) else: print(this_dataframe) else: - if self.verbose == True: - print('wrap it up') - with open(self.lshforestpath , 'wb') as forestout: + if self.verbose: + logging.info('Wrapping it up') + + with open(self.lshforestpath, 'wb') as forestout: forestout.write(pickle.dumps(forest, -1)) + h5flush() - if self.verbose == True: + + if self.verbose: print('DONE SAVER' + str(i)) break - def matrix_updater(self, iprocess , q, retq, matq, l): - print('hogmat saver init ' + str(iprocess)) + def matrix_updater(self, iprocess, q, retq, matq, l): + logging.info('Initialising hogmat saver ' + str(iprocess)) h5mat = None times1 = [] frames = [] @@ -299,205 +325,171 @@ def matrix_updater(self, iprocess , q, retq, matq, l): rows = rows.dropna() maxfam = rows.Fam.max() if h5mat is None: - h5hashes.create_dataset('matrows',(10,block.shape[1]), maxshape=(None, block.shape[1]),chunks=(1, block.shape[1]), dtype='i8') + h5hashes.create_dataset('matrows', (10, block.shape[1]), maxshape=(None, block.shape[1]), + chunks=(1, block.shape[1]), dtype='i8') h5mat = h5hashes['matrows'] if h5mat.shape[0] < maxfam: - h5mat.resize((maxfam+1,block.shape[1])) - i+=1 + h5mat.resize((maxfam + 1, block.shape[1])) + i += 1 frames.append(rows) assign = t.time() index = np.asarray(rows.Fam) block = np.vstack(rows.rows) - h5mat[index,:]= block + h5mat[index, :] = block - times1.append(t.time()-assign) - if len(times1)>10: + times1.append(t.time() - assign) + if len(times1) > 10: times1.pop(0) - print(np.mean(times1)) + logging.info('Mean time: %s' % np.mean(times1)) h5hashes.flush() else: h5hashes.flush() break - print('DONE MAT UPDATER' + str(i)) + logging.info('DONE MAT UPDATER %s' % str(i)) + + def run_pipeline(self, threads): + logging.info('Running with %s threads:' % threads) + functype_dict = {'worker': (self.worker, threads, True), 'updater': (self.saver, 1, False), + 'matrix_updater': (self.matrix_updater, 0, False)} - def run_pipeline(self , threads): - print( 'run w n threads:', threads) - functype_dict = {'worker': (self.worker, threads , True), 'updater': (self.saver, 1, False), - 'matrix_updater': (self.matrix_updater, 0, False) } def mp_with_timeout(functypes, data_generator): - work_processes = {} - update_processes = {} lock = mp.Lock() cores = mp.cpu_count() q = mp.Queue(maxsize=cores * 10) retq = mp.Queue(maxsize=cores * 10) matq = mp.Queue(maxsize=cores * 10) work_processes = {} - print('start workers') + logging.info('Starting workers...') + for key in functypes: worker_function, number_workers, joinval = functypes[key] work_processes[key] = [] for i in range(int(number_workers)): - t = mp.Process(target=worker_function, args=(i, q, retq, matq, lock )) + t = mp.Process(target=worker_function, args=(i, q, retq, matq, lock)) t.daemon = True work_processes[key].append(t) + for key in work_processes: for process in work_processes[key]: + 
logging.info('Starting process') process.start() + for data in data_generator: + logging.info('Putting data') q.put(data) - print('done spooling data') + + logging.info('Spooling data: OK') + for key in work_processes: for i in range(2): for _ in work_processes[key]: q.put(None) - print('joining processes') + logging.info('Joining processes') + for key in work_processes: - worker_function, number_workers , joinval = functypes[key] - if joinval == True: + worker_function, number_workers, joinval = functypes[key] + + if joinval: for process in work_processes[key]: process.join() + for key in work_processes: worker_function, number_workers, joinval = functypes[key] - if joinval == False: + + if not joinval: for _ in work_processes[key]: retq.put(None) matq.put(None) + for key in work_processes: - worker_function, number_workers , joinval = functypes[key] - if joinval == False: + worker_function, number_workers, joinval = functypes[key] + + if not joinval: for process in work_processes[key]: process.join() + gc.collect() print('DONE!') mp_with_timeout(functypes=functype_dict, data_generator=self.generates_dataframes(100)) - return self.hashes_path, self.lshforestpath , self.mat_path + return self.hashes_path, self.lshforestpath, self.mat_path +def arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--outpath', help='name of the db', type=str) + parser.add_argument('--dbtype', help='preconfigured taxonomic ranges', type=str) + parser.add_argument('--OMA', help='use oma data ', type=str) + parser.add_argument('--nthreads', help='nthreads for multiprocessing', type=int) + parser.add_argument('--outfolder', help='folder for storing hash, db and tree objects', type=str) + parser.add_argument('--verbose', help='print verbose output', type=bool) + args = parser.parse_args() -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--taxweights', help='load optimised weights from keras model',type = str) - parser.add_argument('--taxmask', help='consider only one branch',type = str) - parser.add_argument('--taxfilter', help='remove these taxa' , type = str) - parser.add_argument('--outpath', help='name of the db', type = str) - parser.add_argument('--dbtype', help='preconfigured taxonomic ranges' , type = str) - parser.add_argument('--OMA', help='use oma data ' , type = str) - parser.add_argument('--OrthoGlob', help='a glob expression for orthoxml files ' , type = str) - parser.add_argument('--tarfile', help='use tarfile with orthoxml data ' , type = str) - parser.add_argument('--nperm', help='number of hash functions to use when constructing profiles' , type = int) - parser.add_argument('--mastertree', help='master taxonomic tree. 
should use ncbi taxonomic id numbers as leaf names' , type = str) - parser.add_argument('--nthreads', help='nthreads for multiprocessing' , type = int) - parser.add_argument('--outfolder', help='folder for storing hash, db and tree objects' , type = str) - parser.add_argument('--lossonly', help='only compile loss events' , type = bool) - parser.add_argument('--duplonly', help='only compile duplication events' , type = bool) - parser.add_argument('--taxcodes', help='use taxid info in HOGs' , type = bool) - parser.add_argument('--verbose', help='print verbose output' , type = bool) - dbdict = { - 'all': { 'taxfilter': None , 'taxmask': None }, - 'plants': { 'taxfilter': None , 'taxmask': 33090 }, - 'archaea':{ 'taxfilter': None , 'taxmask': 2157 }, - 'bacteria':{ 'taxfilter': None , 'taxmask': 2 }, - 'eukarya':{ 'taxfilter': None , 'taxmask': 2759 }, - 'protists':{ 'taxfilter': [2 , 2157 , 33090 , 4751, 33208] , 'taxmask':None }, - 'fungi':{ 'taxfilter': None , 'taxmask': 4751 }, - 'metazoa':{ 'taxfilter': None , 'taxmask': 33208 }, - 'vertebrates':{ 'taxfilter': None , 'taxmask': 7742 }, - } - taxfilter = None - taxmask = None - omafile = None + return args - args = vars(parser.parse_args(sys.argv[1:])) +def main(args=None): + if args is None: + args = arg_parser() + + dbdict = { + 'all': {'taxfilter': None, 'taxmask': None}, + 'plants': {'taxfilter': None, 'taxmask': 33090}, + 'archaea': {'taxfilter': None, 'taxmask': 2157}, + 'bacteria': {'taxfilter': None, 'taxmask': 2}, + 'eukarya': {'taxfilter': None, 'taxmask': 2759}, + 'protists': {'taxfilter': [2, 2157, 33090, 4751, 33208], 'taxmask': None}, + 'fungi': {'taxfilter': None, 'taxmask': 4751}, + 'metazoa': {'taxfilter': None, 'taxmask': 33208}, + 'vertebrates': {'taxfilter': None, 'taxmask': 7742}, + } - if 'OrthoGlob' in args: - if args['OrthoGlob']: - orthoglob = glob.glob(args['OrthoGlob']+ '*') - else: - orthoglob = None - if 'outpath' in args: - dbname = args['outpath'] + dbname = args.outpath else: raise Exception(' please give your profile an output path with the --outpath argument ') - if args['dbtype']: - taxfilter = dbdict[args['dbtype']]['taxfilter'] - taxmask = dbdict[args['dbtype']]['taxmask'] - if args['taxmask']: - taxfilter = args['taxfilter'] - if args['taxfilter']: - taxmask = args['taxmask'] - if args['nperm']: - nperm = int(args['nperm']) + + if args.dbtype: + taxfilter = dbdict[args.dbtype]['taxfilter'] + taxmask = dbdict[args.dbtype]['taxmask'] else: - nperm = 256 - if args['OMA']: - omafile = args['OMA'] - elif args['tarfile']: - omafile = args['tarfile'] - elif orthoglob: - fileglob = orthoglob + taxfilter=None + taxmask=None + + nperm = 256 + + if args.OMA: + omafile = args.OMA else: raise Exception(' please specify input data ') - - if args['lossonly']: - lossonly = args['lossonly'] - else: - lossonly = False - if args['duplonly']: - duplonly = args['duplonly'] - else: - duplonly = False - - if args['taxcodes']: - taxcodes = args['taxcodes'] - else: - taxcodes = False - if args['verbose']: - verbose = args['verbose'] - else: + if args.verbose: + verbose = args.verbose + else: verbose = False + threads = 4 + if args.nthreads: + threads = args.nthreads - threads = 4 - if args['nthreads']: - threads = args['nthreads'] - if args['taxweights']: - from keras.models import model_from_json - json_file = open( args['taxweights']+ '.json', 'r') - loaded_model_json = json_file.read() - json_file.close() - model = model_from_json(loaded_model_json) - # load weights into new model - model.load_weights( 
args['taxweights']+".h5") - print("Loaded model from disk") - weights = model.get_weights()[0] - weights += 10 ** -10 - else: - weights = None - if args['mastertree']: - mastertree = args['mastertree'] - else: - mastertree=None start = time.time() - if omafile: - with open_file( omafile , mode="r") as h5_oma: - lsh_builder = LSHBuilder(h5_oma = h5_oma, fileglob=orthoglob ,saving_name=dbname , numperm = nperm , - treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose) - lsh_builder.run_pipeline(threads) - else: - lsh_builder = LSHBuilder(h5_oma = None, fileglob=orthoglob ,saving_name=dbname , numperm = nperm , - treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose) - lsh_builder.run_pipeline(threads) + with open_file(omafile, mode="r") as h5_oma: + logging.info('Starting LSH builder') + lsh_builder = LSHBuilder(h5_oma=h5_oma, + saving_name=dbname, + verbose=verbose, + numperm=nperm, + taxfilter=taxfilter, + taxmask=taxmask + ) + lsh_builder.run_pipeline(threads) print(time.time() - start) print('DONE') if __name__ == '__main__': - main() \ No newline at end of file + args = argparse.Namespace(outpath='out', dbtype='eukarya', OMA='data/OmaServer.h5', verbose=True, nthreads=8) + main(args) diff --git a/src/HogProf/orthoxml.py b/src/HogProf/orthoxml.py deleted file mode 100755 index d0c8687..0000000 --- a/src/HogProf/orthoxml.py +++ /dev/null @@ -1,1930 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# -# Generated Mon Jun 27 10:13:43 2011 by generateDS.py version 2.5b. -# - -import sys -import getopt -import re as re_ - -etree_ = None -Verbose_import_ = False -( XMLParser_import_none, XMLParser_import_lxml, - XMLParser_import_elementtree - ) = range(3) -XMLParser_import_library = None -try: - # lxml - from lxml import etree as etree_ - XMLParser_import_library = XMLParser_import_lxml - if Verbose_import_: - print("running with lxml.etree") -except ImportError: - try: - # cElementTree from Python 2.5+ - import xml.etree.cElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with cElementTree on Python 2.5+") - except ImportError: - try: - # ElementTree from Python 2.5+ - import xml.etree.ElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with ElementTree on Python 2.5+") - except ImportError: - try: - # normal cElementTree install - import cElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with cElementTree") - except ImportError: - try: - # normal ElementTree install - import elementtree.ElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with ElementTree") - except ImportError: - raise ImportError("Failed to import ElementTree from any known place") - -def parsexml_(*args, **kwargs): - if (XMLParser_import_library == XMLParser_import_lxml and - 'parser' not in kwargs): - # Use the lxml ElementTree compatible parser so that, e.g., - # we ignore comments. - kwargs['parser'] = etree_.ETCompatXMLParser() - doc = etree_.parse(*args, **kwargs) - return doc - -# -# User methods -# -# Calls to the methods in these classes are generated by generateDS.py. 
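The generateDS-based classes in this module are removed by the patch, while the retained pyhamutils helpers (get_species_from_orthoxml, switch_name_ncbi_id, shown earlier in this diff) parse orthoXML strings directly with xml.etree. A toy sketch of that style, using a made-up two-species snippet rather than any file shipped with HogProf:

# Toy sketch (hypothetical input): read species names and taxids straight off the
# parsed orthoXML, mirroring HogProf.utils.pyhamutils.get_species_from_orthoxml.
import xml.etree.ElementTree as ET

toy_orthoxml = (
    '<orthoXML xmlns="http://orthoXML.org/2011/" origin="example" version="0.3">'
    '<species name="HUMAN" NCBITaxId="9606"/>'
    '<species name="MOUSE" NCBITaxId="10090"/>'
    '</orthoXML>'
)

root = ET.fromstring(toy_orthoxml)
taxid2name = {c.attrib['NCBITaxId']: c.attrib['name']
              for c in root if 'species' in c.tag}
print(taxid2name)  # {'9606': 'HUMAN', '10090': 'MOUSE'}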
-# You can replace these methods by re-implementing the following class -# in a module named generatedssuper.py. - -try: - from generatedssuper import GeneratedsSuper -except ImportError, exp: - - class GeneratedsSuper(object): - def gds_format_string(self, input_data, input_name=''): - return input_data - def gds_validate_string(self, input_data, node, input_name=''): - return input_data - def gds_format_integer(self, input_data, input_name=''): - return '%d' % input_data - def gds_validate_integer(self, input_data, node, input_name=''): - return input_data - def gds_format_integer_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_integer_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of integers') - return input_data - def gds_format_float(self, input_data, input_name=''): - return '%f' % input_data - def gds_validate_float(self, input_data, node, input_name=''): - return input_data - def gds_format_float_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_float_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of floats') - return input_data - def gds_format_double(self, input_data, input_name=''): - return '%e' % input_data - def gds_validate_double(self, input_data, node, input_name=''): - return input_data - def gds_format_double_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_double_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of doubles') - return input_data - def gds_format_boolean(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_boolean(self, input_data, node, input_name=''): - return input_data - def gds_format_boolean_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_boolean_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - if value not in ('true', '1', 'false', '0', ): - raise_parse_error(node, 'Requires sequence of booleans ("true", "1", "false", "0")') - return input_data - def gds_str_lower(self, instring): - return instring.lower() - def get_path_(self, node): - path_list = [] - self.get_path_list_(node, path_list) - path_list.reverse() - path = '/'.join(path_list) - return path - Tag_strip_pattern_ = re_.compile(r'\{.*\}') - def get_path_list_(self, node, path_list): - if node is None: - return - tag = GeneratedsSuper.Tag_strip_pattern_.sub('', node.tag) - if tag: - path_list.append(tag) - self.get_path_list_(node.getparent(), path_list) - - -# -# If you have installed IPython you can uncomment and use the following. -# IPython is available from http://ipython.scipy.org/. 
-# - -## from IPython.Shell import IPShellEmbed -## args = '' -## ipshell = IPShellEmbed(args, -## banner = 'Dropping into IPython', -## exit_msg = 'Leaving Interpreter, back to program.') - -# Then use the following line where and when you want to drop into the -# IPython shell: -# ipshell(' -- Entering ipshell.\nHit Ctrl-D to exit') - -# -# Globals -# - -ExternalEncoding = 'utf-8' -Tag_pattern_ = re_.compile(r'({.*})?(.*)') -STRING_CLEANUP_PAT = re_.compile(r"[\n\r\s]+") - -# -# Support/utility functions. -# - -def showIndent(outfile, level): - for idx in range(level): - outfile.write(' ') - -def quote_xml(inStr): - if not inStr: - return '' - s1 = (isinstance(inStr, basestring) and inStr or - '%s' % inStr) - s1 = s1.replace('&', '&') - s1 = s1.replace('<', '<') - s1 = s1.replace('>', '>') - return s1 - -def quote_attrib(inStr): - s1 = (isinstance(inStr, basestring) and inStr or - '%s' % inStr) - s1 = s1.replace('&', '&') - s1 = s1.replace('<', '<') - s1 = s1.replace('>', '>') - if '"' in s1: - if "'" in s1: - s1 = '"%s"' % s1.replace('"', """) - else: - s1 = "'%s'" % s1 - else: - s1 = '"%s"' % s1 - return s1 - -def quote_python(inStr): - s1 = inStr - if s1.find("'") == -1: - if s1.find('\n') == -1: - return "'%s'" % s1 - else: - return "'''%s'''" % s1 - else: - if s1.find('"') != -1: - s1 = s1.replace('"', '\\"') - if s1.find('\n') == -1: - return '"%s"' % s1 - else: - return '"""%s"""' % s1 - -def get_all_text_(node): - if node.text is not None: - text = node.text - else: - text = '' - for child in node: - if child.tail is not None: - text += child.tail - return text - -def find_attr_value_(attr_name, node): - attrs = node.attrib - # First try with no namespace. - value = attrs.get(attr_name) - if value is None: - # Now try the other possible namespaces. - namespaces = node.nsmap.itervalues() - for namespace in namespaces: - value = attrs.get('{%s}%s' % (namespace, attr_name, )) - if value is not None: - break - return value - - -class GDSParseError(Exception): - pass - -def raise_parse_error(node, msg): - if XMLParser_import_library == XMLParser_import_lxml: - msg = '%s (element %s/line %d)' % (msg, node.tag, node.sourceline, ) - else: - msg = '%s (element %s)' % (msg, node.tag, ) - raise GDSParseError(msg) - - -class MixedContainer: - # Constants for category: - CategoryNone = 0 - CategoryText = 1 - CategorySimple = 2 - CategoryComplex = 3 - # Constants for content_type: - TypeNone = 0 - TypeText = 1 - TypeString = 2 - TypeInteger = 3 - TypeFloat = 4 - TypeDecimal = 5 - TypeDouble = 6 - TypeBoolean = 7 - def __init__(self, category, content_type, name, value): - self.category = category - self.content_type = content_type - self.name = name - self.value = value - def getCategory(self): - return self.category - def getContenttype(self, content_type): - return self.content_type - def getValue(self): - return self.value - def getName(self): - return self.name - def export(self, outfile, level, name, namespace): - if self.category == MixedContainer.CategoryText: - # Prevent exporting empty content as empty lines. 
- if self.value.strip(): - outfile.write(self.value) - elif self.category == MixedContainer.CategorySimple: - self.exportSimple(outfile, level, name) - else: # category == MixedContainer.CategoryComplex - self.value.export(outfile, level, namespace,name) - def exportSimple(self, outfile, level, name): - if self.content_type == MixedContainer.TypeString: - outfile.write('<%s>%s' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeInteger or \ - self.content_type == MixedContainer.TypeBoolean: - outfile.write('<%s>%d' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeFloat or \ - self.content_type == MixedContainer.TypeDecimal: - outfile.write('<%s>%f' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeDouble: - outfile.write('<%s>%g' % (self.name, self.value, self.name)) - def exportLiteral(self, outfile, level, name): - if self.category == MixedContainer.CategoryText: - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s", "%s"),\n' % \ - (self.category, self.content_type, self.name, self.value)) - elif self.category == MixedContainer.CategorySimple: - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s", "%s"),\n' % \ - (self.category, self.content_type, self.name, self.value)) - else: # category == MixedContainer.CategoryComplex - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s",\n' % \ - (self.category, self.content_type, self.name,)) - self.value.exportLiteral(outfile, level + 1) - showIndent(outfile, level) - outfile.write(')\n') - - -class MemberSpec_(object): - def __init__(self, name='', data_type='', container=0): - self.name = name - self.data_type = data_type - self.container = container - def set_name(self, name): self.name = name - def get_name(self): return self.name - def set_data_type(self, data_type): self.data_type = data_type - def get_data_type_chain(self): return self.data_type - def get_data_type(self): - if isinstance(self.data_type, list): - if len(self.data_type) > 0: - return self.data_type[-1] - else: - return 'xs:string' - else: - return self.data_type - def set_container(self, container): self.container = container - def get_container(self): return self.container - -def _cast(typ, value): - if typ is None or value is None: - return value - return typ(value) - -# -# Data representation classes. -# - -class orthoXML(GeneratedsSuper): - """The OrthoXML root element. The source program/database of the file - for instance OMA or InParanoid. The version number of the file. 
- The version or release number of the source program/database at - time the file was generated.""" - subclass = None - superclass = None - def __init__(self, origin=None, version=None, originVersion=None, notes=None, species=None, scores=None, groups=None, valueOf_=None): - self.origin = _cast(None, origin) - self.version = _cast(float, version) - self.originVersion = _cast(None, originVersion) - self.notes = notes - if species is None: - self.species = [] - else: - self.species = species - self.scores = scores - self.groups = groups - def factory(*args_, **kwargs_): - if orthoXML.subclass: - return orthoXML.subclass(*args_, **kwargs_) - else: - return orthoXML(*args_, **kwargs_) - factory = staticmethod(factory) - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_species(self): return self.species - def set_species(self, species): self.species = species - def add_species(self, value): self.species.append(value) - def insert_species(self, index, value): self.species[index] = value - def get_scores(self): return self.scores - def set_scores(self, scores): self.scores = scores - def get_groups(self): return self.groups - def set_groups(self, groups): self.groups = groups - def get_origin(self): return self.origin - def set_origin(self, origin): self.origin = origin - def get_version(self): return self.version - def set_version(self, version): self.version = version - def get_originVersion(self): return self.originVersion - def set_originVersion(self, originVersion): self.originVersion = originVersion - def export(self, outfile, level, namespace_='ortho:', name_='orthoXML', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='orthoXML') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='orthoXML'): - if self.origin is not None and 'origin' not in already_processed: - already_processed.append('origin') - outfile.write(' origin=%s' % (self.gds_format_string(quote_attrib(self.origin).encode(ExternalEncoding), input_name='origin'), )) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - outfile.write(' version="%s"' % self.gds_format_float(self.version, input_name='version')) - if self.originVersion is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - outfile.write(' originVersion=%s' % (self.gds_format_string(quote_attrib(self.originVersion).encode(ExternalEncoding), input_name='originVersion'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='orthoXML', fromsubclass_=False): - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - for species_ in self.species: - species_.export(outfile, level, namespace_, name_='species') - if self.scores: - self.scores.export(outfile, level, namespace_, name_='scores') - if self.groups: - self.groups.export(outfile, level, namespace_, name_='groups', ) - def hasContent_(self): - if ( - self.notes is not None or - self.species or - self.scores is not None or - self.groups is not None - ): - return True - else: - 
return False - def exportLiteral(self, outfile, level, name_='orthoXML'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.origin is not None and 'origin' not in already_processed: - already_processed.append('origin') - showIndent(outfile, level) - outfile.write('origin = "%s",\n' % (self.origin,)) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - showIndent(outfile, level) - outfile.write('version = %f,\n' % (self.version,)) - if self.originVersion is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - showIndent(outfile, level) - outfile.write('originVersion = "%s",\n' % (self.originVersion,)) - def exportLiteralChildren(self, outfile, level, name_): - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - showIndent(outfile, level) - outfile.write('species=[\n') - level += 1 - for species_ in self.species: - showIndent(outfile, level) - outfile.write('model_.species(\n') - species_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.scores is not None: - showIndent(outfile, level) - outfile.write('scores=model_.scores(\n') - self.scores.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - if self.groups is not None: - showIndent(outfile, level) - outfile.write('groups=model_.groups(\n') - self.groups.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('origin', node) - if value is not None and 'origin' not in already_processed: - already_processed.append('origin') - self.origin = value - value = find_attr_value_('version', node) - if value is not None and 'version' not in already_processed: - already_processed.append('version') - try: - self.version = float(value) - except ValueError, exp: - raise ValueError('Bad float/double attribute (version): %s' % exp) - value = find_attr_value_('originVersion', node) - if value is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - self.originVersion = value - self.originVersion = ' '.join(self.originVersion.split()) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) - elif nodeName_ == 'species': - obj_ = species.factory() - obj_.build(child_) - self.species.append(obj_) - elif nodeName_ == 'scores': - obj_ = scores.factory() - obj_.build(child_) - self.set_scores(obj_) - elif nodeName_ == 'groups': - obj_ = groups.factory() - obj_.build(child_) - self.set_groups(obj_) -# end class orthoXML - - -class species(GeneratedsSuper): - """The species element contains all sequences of one species. The NCBI - Taxonomy identifier of the species to identify it unambiguously. 
- The name of the species.""" - subclass = None - superclass = None - def __init__(self, name=None, NCBITaxId=None, database=None, notes=None, valueOf_=None): - self.name = _cast(None, name) - self.NCBITaxId = _cast(int, NCBITaxId) - if database is None: - self.database = [] - else: - self.database = database - self.notes = notes - def factory(*args_, **kwargs_): - if species.subclass: - return species.subclass(*args_, **kwargs_) - else: - return species(*args_, **kwargs_) - factory = staticmethod(factory) - def get_database(self): return self.database - def set_database(self, database): self.database = database - def add_database(self, value): self.database.append(value) - def insert_database(self, index, value): self.database[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_NCBITaxId(self): return self.NCBITaxId - def set_NCBITaxId(self, NCBITaxId): self.NCBITaxId = NCBITaxId - def export(self, outfile, level, namespace_='ortho:', name_='species', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='species') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='species'): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.NCBITaxId is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - outfile.write(' NCBITaxId="%s"' % self.gds_format_integer(self.NCBITaxId, input_name='NCBITaxId')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='species', fromsubclass_=False): - for database_ in self.database: - database_.export(outfile, level, namespace_, name_='database') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.database or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='species'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.NCBITaxId is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - showIndent(outfile, level) - outfile.write('NCBITaxId = %d,\n' % (self.NCBITaxId,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('database=[\n') - level += 1 - for database_ in self.database: - showIndent(outfile, level) - outfile.write('model_.database(\n') - database_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, 
level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('NCBITaxId', node) - if value is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - try: - self.NCBITaxId = int(value) - except ValueError, exp: - raise_parse_error(node, 'Bad integer attribute: %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'database': - obj_ = database.factory() - obj_.build(child_) - self.database.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class species - - -class database(GeneratedsSuper): - """A database element contains all genes from a single database/source. - A Uniform Resource Identifier (URI) pointing to the gene. In the - simplest case one could imagine a URL which in concatenation - with the gene identifier links to the website of the gene in the - source database. However, how this is used depends on the source - of the orthoXML file. Name of the database. A Uniform Resource - Identifier (URI) pointing to the protein. A Uniform Resource - Identifier (URI) pointing to the transcript. Version number of - the database.""" - subclass = None - superclass = None - def __init__(self, transcriptLink=None, protLink=None, geneLink=None, name=None, version=None, genes=None, valueOf_=None): - self.transcriptLink = _cast(None, transcriptLink) - self.protLink = _cast(None, protLink) - self.geneLink = _cast(None, geneLink) - self.name = _cast(None, name) - self.version = _cast(None, version) - self.genes = genes - def factory(*args_, **kwargs_): - if database.subclass: - return database.subclass(*args_, **kwargs_) - else: - return database(*args_, **kwargs_) - factory = staticmethod(factory) - def get_genes(self): return self.genes - def set_genes(self, genes): self.genes = genes - def get_transcriptLink(self): return self.transcriptLink - def set_transcriptLink(self, transcriptLink): self.transcriptLink = transcriptLink - def get_protLink(self): return self.protLink - def set_protLink(self, protLink): self.protLink = protLink - def get_geneLink(self): return self.geneLink - def set_geneLink(self, geneLink): self.geneLink = geneLink - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_version(self): return self.version - def set_version(self, version): self.version = version - def export(self, outfile, level, namespace_='ortho:', name_='database', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='database') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - 
def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='database'): - if self.transcriptLink is not None and 'transcriptLink' not in already_processed: - already_processed.append('transcriptLink') - outfile.write(' transcriptLink=%s' % (self.gds_format_string(quote_attrib(self.transcriptLink).encode(ExternalEncoding), input_name='transcriptLink'), )) - if self.protLink is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - outfile.write(' protLink=%s' % (self.gds_format_string(quote_attrib(self.protLink).encode(ExternalEncoding), input_name='protLink'), )) - if self.geneLink is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - outfile.write(' geneLink=%s' % (self.gds_format_string(quote_attrib(self.geneLink).encode(ExternalEncoding), input_name='geneLink'), )) - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - outfile.write(' version=%s' % (self.gds_format_string(quote_attrib(self.version).encode(ExternalEncoding), input_name='version'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='database', fromsubclass_=False): - if self.genes: - self.genes.export(outfile, level, namespace_, name_='genes', ) - def hasContent_(self): - if ( - self.genes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='database'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.transcriptLink is not None and 'transcriptLink' not in already_processed: - already_processed.append('transcriptLink') - showIndent(outfile, level) - outfile.write('transcriptLink = "%s",\n' % (self.transcriptLink,)) - if self.protLink is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - showIndent(outfile, level) - outfile.write('protLink = "%s",\n' % (self.protLink,)) - if self.geneLink is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - showIndent(outfile, level) - outfile.write('geneLink = "%s",\n' % (self.geneLink,)) - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - showIndent(outfile, level) - outfile.write('version = "%s",\n' % (self.version,)) - def exportLiteralChildren(self, outfile, level, name_): - if self.genes is not None: - showIndent(outfile, level) - outfile.write('genes=model_.genes(\n') - self.genes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('transcriptLink', node) - if value is not None and 'transcriptLink' not in already_processed: - 
already_processed.append('transcriptLink') - self.transcriptLink = value - value = find_attr_value_('protLink', node) - if value is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - self.protLink = value - value = find_attr_value_('geneLink', node) - if value is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - self.geneLink = value - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('version', node) - if value is not None and 'version' not in already_processed: - already_processed.append('version') - self.version = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'genes': - obj_ = genes.factory() - obj_.build(child_) - self.set_genes(obj_) -# end class database - - -class genes(GeneratedsSuper): - """A gene element represents a list of genes.""" - subclass = None - superclass = None - def __init__(self, gene=None, valueOf_=None): - if gene is None: - self.gene = [] - else: - self.gene = gene - def factory(*args_, **kwargs_): - if genes.subclass: - return genes.subclass(*args_, **kwargs_) - else: - return genes(*args_, **kwargs_) - factory = staticmethod(factory) - def get_gene(self): return self.gene - def set_gene(self, gene): self.gene = gene - def add_gene(self, value): self.gene.append(value) - def insert_gene(self, index, value): self.gene[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='genes', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='genes') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='genes'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='genes', fromsubclass_=False): - for gene_ in self.gene: - gene_.export(outfile, level, namespace_, name_='gene') - def hasContent_(self): - if ( - self.gene - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='genes'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('gene=[\n') - level += 1 - for gene_ in self.gene: - showIndent(outfile, level) - outfile.write('model_.gene(\n') - gene_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'gene': - obj_ = gene.factory() - obj_.build(child_) - 
self.gene.append(obj_) -# end class genes - - -class gene(GeneratedsSuper): - """The gene element represents a single gene, protein or transcript. It - is in fact a set of identifiers: one internal identifier that is - used to link from geneRef elements in ortholog clusters and gene - identifiers, transcript identifiers and protein identifiers to - identify the molecule. The proper term for this element would - therefore rather be molecule. However, as the general purpose of - orthoXML is to represent orthology data for genes the term gene - is used instead. Gene, protein and transcipt identifiers are - optional but at least one of the three should be given. The - source database of the gene is defined through the database - element in which the gene element lies and the identifiers - should stem from this source. Identifier of the gene in the - source database. Multiple splice forms are possible by having - the same geneId more than once. Internal identifier to link to - the gene via the geneRef elements. Identifier of the protein in - the source database. Identifier of the transcript in the source - database.""" - subclass = None - superclass = None - def __init__(self, protId=None, id=None, geneId=None, transcriptId=None, valueOf_=None): - self.protId = _cast(None, protId) - self.id = _cast(int, id) - self.geneId = _cast(None, geneId) - self.transcriptId = _cast(None, transcriptId) - pass - def factory(*args_, **kwargs_): - if gene.subclass: - return gene.subclass(*args_, **kwargs_) - else: - return gene(*args_, **kwargs_) - factory = staticmethod(factory) - def get_protId(self): return self.protId - def set_protId(self, protId): self.protId = protId - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_geneId(self): return self.geneId - def set_geneId(self, geneId): self.geneId = geneId - def get_transcriptId(self): return self.transcriptId - def set_transcriptId(self, transcriptId): self.transcriptId = transcriptId - def export(self, outfile, level, namespace_='ortho:', name_='gene', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='gene') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='gene'): - if self.protId is not None and 'protId' not in already_processed: - already_processed.append('protId') - outfile.write(' protId=%s' % (self.gds_format_string(quote_attrib(self.protId).encode(ExternalEncoding), input_name='protId'), )) - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id="%s"' % self.gds_format_integer(self.id, input_name='id')) - if self.geneId is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - outfile.write(' geneId=%s' % (self.gds_format_string(quote_attrib(self.geneId).encode(ExternalEncoding), input_name='geneId'), )) - if self.transcriptId is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - outfile.write(' transcriptId=%s' % (self.gds_format_string(quote_attrib(self.transcriptId).encode(ExternalEncoding), input_name='transcriptId'), )) - def exportChildren(self, outfile, level, 
namespace_='ortho:', name_='gene', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='gene'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.protId is not None and 'protId' not in already_processed: - already_processed.append('protId') - showIndent(outfile, level) - outfile.write('protId = "%s",\n' % (self.protId,)) - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = %d,\n' % (self.id,)) - if self.geneId is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - showIndent(outfile, level) - outfile.write('geneId = "%s",\n' % (self.geneId,)) - if self.transcriptId is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - showIndent(outfile, level) - outfile.write('transcriptId = "%s",\n' % (self.transcriptId,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('protId', node) - if value is not None and 'protId' not in already_processed: - already_processed.append('protId') - self.protId = value - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - try: - self.id = int(value) - except ValueError, exp: - raise_parse_error(node, 'Bad integer attribute: %s' % exp) - value = find_attr_value_('geneId', node) - if value is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - self.geneId = value - value = find_attr_value_('transcriptId', node) - if value is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - self.transcriptId = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class gene - - -class scores(GeneratedsSuper): - """A list of score definitions.""" - subclass = None - superclass = None - def __init__(self, scoreDef=None, valueOf_=None): - if scoreDef is None: - self.scoreDef = [] - else: - self.scoreDef = scoreDef - def factory(*args_, **kwargs_): - if scores.subclass: - return scores.subclass(*args_, **kwargs_) - else: - return scores(*args_, **kwargs_) - factory = staticmethod(factory) - def get_scoreDef(self): return self.scoreDef - def set_scoreDef(self, scoreDef): self.scoreDef = scoreDef - def add_scoreDef(self, value): self.scoreDef.append(value) - def insert_scoreDef(self, index, value): self.scoreDef[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='scores', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='scores') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - 
outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='scores'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='scores', fromsubclass_=False): - for scoreDef_ in self.scoreDef: - scoreDef_.export(outfile, level, namespace_, name_='scoreDef') - def hasContent_(self): - if ( - self.scoreDef - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='scores'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('scoreDef=[\n') - level += 1 - for scoreDef_ in self.scoreDef: - showIndent(outfile, level) - outfile.write('model_.scoreDef(\n') - scoreDef_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'scoreDef': - obj_ = scoreDef.factory() - obj_.build(child_) - self.scoreDef.append(obj_) -# end class scores - - -class groups(GeneratedsSuper): - """Represents the list of ortholog groups. Note that the purpose of - OrthoXML is to store orthology assignment hence on the top level - only ortholog groups are allowed.""" - subclass = None - superclass = None - def __init__(self, orthologGroup=None, valueOf_=None): - if orthologGroup is None: - self.orthologGroup = [] - else: - self.orthologGroup = orthologGroup - def factory(*args_, **kwargs_): - if groups.subclass: - return groups.subclass(*args_, **kwargs_) - else: - return groups(*args_, **kwargs_) - factory = staticmethod(factory) - def get_orthologGroup(self): return self.orthologGroup - def set_orthologGroup(self, orthologGroup): self.orthologGroup = orthologGroup - def add_orthologGroup(self, value): self.orthologGroup.append(value) - def insert_orthologGroup(self, index, value): self.orthologGroup[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='groups', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='groups') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='groups'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='groups', fromsubclass_=False): - for orthologGroup_ in self.orthologGroup: - orthologGroup_.export(outfile, level, namespace_, name_='orthologGroup') - def hasContent_(self): - if ( - self.orthologGroup - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='groups'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if 
self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('orthologGroup=[\n') - level += 1 - for orthologGroup_ in self.orthologGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - orthologGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'orthologGroup': - obj_ = group.factory() - obj_.build(child_) - self.orthologGroup.append(obj_) -# end class groups - - -class group(GeneratedsSuper): - """A group of genes or nested groups. In case of a orothologGroup - element, all genes in the group or in the nested groups are - orthologs to each other i.e. stem from the same gene in the last - common ancester of the species. In case of a paralogGroup the - genes are paralogs to each other. Subgroups within the group - allow the represention of phylogenetic trees. For more details - and examples see http://orthoxml.org/orthoxml_doc.html. A group - can may contain two or more of the three alternatives geneRef, - paralogGroup, and orthologGroup. By combining these, complex - phylogenies are possible. Identifier for the group in context of - the resource. This attribute is not required but if your - resource provides identifiers for the ortholog groups we - strongly recommend to use it at least for the top level groups.""" - subclass = None - superclass = None - def __init__(self, id=None, score=None, property=None, geneRef=None, paralogGroup=None, orthologGroup=None, notes=None, valueOf_=None): - self.id = _cast(None, id) - if score is None: - self.score = [] - else: - self.score = score - if property is None: - self.property = [] - else: - self.property = property - if geneRef is None: - self.geneRef = [] - else: - self.geneRef = geneRef - if paralogGroup is None: - self.paralogGroup = [] - else: - self.paralogGroup = paralogGroup - if orthologGroup is None: - self.orthologGroup = [] - else: - self.orthologGroup = orthologGroup - self.notes = notes - def factory(*args_, **kwargs_): - if group.subclass: - return group.subclass(*args_, **kwargs_) - else: - return group(*args_, **kwargs_) - factory = staticmethod(factory) - def get_score(self): return self.score - def set_score(self, score): self.score = score - def add_score(self, value): self.score.append(value) - def insert_score(self, index, value): self.score[index] = value - def get_property(self): return self.property - def set_property(self, property): self.property = property - def add_property(self, value): self.property.append(value) - def insert_property(self, index, value): self.property[index] = value - def get_geneRef(self): return self.geneRef - def set_geneRef(self, geneRef): self.geneRef = geneRef - def add_geneRef(self, value): self.geneRef.append(value) - def insert_geneRef(self, index, value): self.geneRef[index] = value - def get_paralogGroup(self): return self.paralogGroup - def set_paralogGroup(self, paralogGroup): self.paralogGroup = paralogGroup 
- def add_paralogGroup(self, value): self.paralogGroup.append(value) - def insert_paralogGroup(self, index, value): self.paralogGroup[index] = value - def get_orthologGroup(self): return self.orthologGroup - def set_orthologGroup(self, orthologGroup): self.orthologGroup = orthologGroup - def add_orthologGroup(self, value): self.orthologGroup.append(value) - def insert_orthologGroup(self, index, value): self.orthologGroup[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_id(self): return self.id - def set_id(self, id): self.id = id - def export(self, outfile, level, namespace_='ortho:', name_='group', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='group') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='group'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (self.gds_format_string(quote_attrib(self.id).encode(ExternalEncoding), input_name='id'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='group', fromsubclass_=False): - for score_ in self.score: - score_.export(outfile, level, namespace_, name_='score') - for property_ in self.property: - property_.export(outfile, level, namespace_, name_='property') - for geneRef_ in self.geneRef: - geneRef_.export(outfile, level, namespace_, name_='geneRef') - for paralogGroup_ in self.paralogGroup: - paralogGroup_.export(outfile, level, namespace_, name_='paralogGroup') - for orthologGroup_ in self.orthologGroup: - orthologGroup_.export(outfile, level, namespace_, name_='orthologGroup') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.score or - self.property or - self.geneRef or - self.paralogGroup or - self.orthologGroup or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='group'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('score=[\n') - level += 1 - for score_ in self.score: - showIndent(outfile, level) - outfile.write('model_.score(\n') - score_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('property=[\n') - level += 1 - for property_ in self.property: - showIndent(outfile, level) - outfile.write('model_.property(\n') - property_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - 
outfile.write('geneRef=[\n') - level += 1 - for geneRef_ in self.geneRef: - showIndent(outfile, level) - outfile.write('model_.geneRef(\n') - geneRef_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('paralogGroup=[\n') - level += 1 - for paralogGroup_ in self.paralogGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - paralogGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('orthologGroup=[\n') - level += 1 - for orthologGroup_ in self.orthologGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - orthologGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'score': - obj_ = score.factory() - obj_.build(child_) - self.score.append(obj_) - elif nodeName_ == 'property': - obj_ = property.factory() - obj_.build(child_) - self.property.append(obj_) - elif nodeName_ == 'geneRef': - obj_ = geneRef.factory() - obj_.build(child_) - self.geneRef.append(obj_) - elif nodeName_ == 'paralogGroup': - obj_ = group.factory() - obj_.build(child_) - self.paralogGroup.append(obj_) - elif nodeName_ == 'orthologGroup': - obj_ = group.factory() - obj_.build(child_) - self.orthologGroup.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class group - - -class geneRef(GeneratedsSuper): - """The geneRef element is a link to the gene definition under the - species element. It defines the members of an ortholog or - paralog group. The same gene can be referenced muliple times. - The geneRef element can have multiple score elements and a notes - elements as children. The notes element can for instance be used - for special, ortholog-database-specific information (with - InParanoid, for example, we could use it to mark the seed - orthologs). 
Internal identifier for a gene element defined under - the species element.""" - subclass = None - superclass = None - def __init__(self, id=None, score=None, notes=None, valueOf_=None): - self.id = _cast(int, id) - if score is None: - self.score = [] - else: - self.score = score - self.notes = notes - def factory(*args_, **kwargs_): - if geneRef.subclass: - return geneRef.subclass(*args_, **kwargs_) - else: - return geneRef(*args_, **kwargs_) - factory = staticmethod(factory) - def get_score(self): return self.score - def set_score(self, score): self.score = score - def add_score(self, value): self.score.append(value) - def insert_score(self, index, value): self.score[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_id(self): return self.id - def set_id(self, id): self.id = id - def export(self, outfile, level, namespace_='ortho:', name_='geneRef', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='geneRef') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='geneRef'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id="%s"' % self.gds_format_integer(self.id, input_name='id')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='geneRef', fromsubclass_=False): - for score_ in self.score: - score_.export(outfile, level, namespace_, name_='score') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.score or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='geneRef'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = %d,\n' % (self.id,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('score=[\n') - level += 1 - for score_ in self.score: - showIndent(outfile, level) - outfile.write('model_.score(\n') - score_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - try: - self.id = int(value) - except ValueError, exp: - raise_parse_error(node, 
'Bad integer attribute: %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'score': - obj_ = score.factory() - obj_.build(child_) - self.score.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class geneRef - - -class scoreDef(GeneratedsSuper): - """The scoreDef element defines a score. One of the concepts of - orthoXML is to be as flexible as possible but still uniformly - parsable. Part of this is to allow every ortholog resource to - give their own types of scores for groups or group members, - which is done using score elements. Score elements can be - defined to apply to either groups or geneRefs. It is possible to - define multiple scores. An internal identifier to link to the - scoreDef from a score element. Description of the score.""" - subclass = None - superclass = None - def __init__(self, id=None, desc=None, valueOf_=None): - self.id = _cast(None, id) - self.desc = _cast(None, desc) - pass - def factory(*args_, **kwargs_): - if scoreDef.subclass: - return scoreDef.subclass(*args_, **kwargs_) - else: - return scoreDef(*args_, **kwargs_) - factory = staticmethod(factory) - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_desc(self): return self.desc - def set_desc(self, desc): self.desc = desc - def export(self, outfile, level, namespace_='ortho:', name_='scoreDef', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='scoreDef') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='scoreDef'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (quote_attrib(self.id), )) - if self.desc is not None and 'desc' not in already_processed: - already_processed.append('desc') - outfile.write(' desc=%s' % (self.gds_format_string(quote_attrib(self.desc).encode(ExternalEncoding), input_name='desc'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='scoreDef', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='scoreDef'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - if self.desc is not None and 'desc' not in already_processed: - already_processed.append('desc') - showIndent(outfile, level) - outfile.write('desc = "%s",\n' % (self.desc,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if 
value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - value = find_attr_value_('desc', node) - if value is not None and 'desc' not in already_processed: - already_processed.append('desc') - self.desc = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class scoreDef - - -class score(GeneratedsSuper): - """The score element gives the value of a score and links it to the - scoreDef element, which defines the score. It can be child of a - group or a geneRef element to allow scoring on different levels. - An identifier linking to the scoreDef element, which defines the - score. The actual value of the score. For instance a confidence - score of a group member.""" - subclass = None - superclass = None - def __init__(self, id=None, value=None, valueOf_=None): - self.id = _cast(None, id) - self.value = _cast(float, value) - pass - def factory(*args_, **kwargs_): - if score.subclass: - return score.subclass(*args_, **kwargs_) - else: - return score(*args_, **kwargs_) - factory = staticmethod(factory) - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_value(self): return self.value - def set_value(self, value): self.value = value - def export(self, outfile, level, namespace_='ortho:', name_='score', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='score') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='score'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (quote_attrib(self.id), )) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - outfile.write(' value="%s"' % self.gds_format_float(self.value, input_name='value')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='score', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='score'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - showIndent(outfile, level) - outfile.write('value = %f,\n' % (self.value,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - value = find_attr_value_('value', node) - if value is 
not None and 'value' not in already_processed: - already_processed.append('value') - try: - self.value = float(value) - except ValueError, exp: - raise ValueError('Bad float/double attribute (value): %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class score - - -class property(GeneratedsSuper): - """Key-value pair for group annotations, for instance statistics about - the group members. The key of the key-value annotation pair. The - value of the key-value annotation pair. Optional to allow flag - like annotations.""" - subclass = None - superclass = None - def __init__(self, name=None, value=None, valueOf_=None): - self.name = _cast(None, name) - self.value = _cast(None, value) - pass - def factory(*args_, **kwargs_): - if property.subclass: - return property.subclass(*args_, **kwargs_) - else: - return property(*args_, **kwargs_) - factory = staticmethod(factory) - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_value(self): return self.value - def set_value(self, value): self.value = value - def export(self, outfile, level, namespace_='ortho:', name_='property', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='property') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='property'): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - outfile.write(' value=%s' % (self.gds_format_string(quote_attrib(self.value).encode(ExternalEncoding), input_name='value'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='property', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='property'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - showIndent(outfile, level) - outfile.write('value = "%s",\n' % (self.value,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('value', node) - if value is not 
None and 'value' not in already_processed: - already_processed.append('value') - self.value = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class property - - -class notes(GeneratedsSuper): - """The notes element is a special element, which allows adding - information that is not general enough to be part of the - standard. I.e. something specific to a particular ortholog - database or algorithm. Notes elements will not be validated, so - any child elements are legal. Notes elements can be children of - the root element orthoXML, the species element, the - orthologGroup element, the paralogGroup element, or the geneRef - element.""" - subclass = None - superclass = None - def __init__(self, valueOf_=None, mixedclass_=None, content_=None): - self.valueOf_ = valueOf_ - if mixedclass_ is None: - self.mixedclass_ = MixedContainer - else: - self.mixedclass_ = mixedclass_ - if content_ is None: - self.content_ = [] - else: - self.content_ = content_ - self.valueOf_ = valueOf_ - def factory(*args_, **kwargs_): - if notes.subclass: - return notes.subclass(*args_, **kwargs_) - else: - return notes(*args_, **kwargs_) - factory = staticmethod(factory) - def get_valueOf_(self): return self.valueOf_ - def set_valueOf_(self, valueOf_): self.valueOf_ = valueOf_ - def export(self, outfile, level, namespace_='ortho:', name_='notes', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='notes') - outfile.write('>') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='notes'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='notes', fromsubclass_=False): - pass - def hasContent_(self): - if ( - self.valueOf_ - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='notes'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - showIndent(outfile, level) - outfile.write('valueOf_ = """%s""",\n' % (self.valueOf_,)) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - self.valueOf_ = get_all_text_(node) - if node.text is not None: - obj_ = self.mixedclass_(MixedContainer.CategoryText, - MixedContainer.TypeNone, '', node.text) - self.content_.append(obj_) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if not fromsubclass_ and child_.tail is not None: - obj_ = self.mixedclass_(MixedContainer.CategoryText, - MixedContainer.TypeNone, '', child_.tail) - self.content_.append(obj_) - pass -# end class notes - - -USAGE_TEXT = """ -Usage: python .py [ -s ] -""" - -def usage(): - print USAGE_TEXT - sys.exit(1) - - -def get_root_tag(node): - tag = Tag_pattern_.match(node.tag).groups()[-1] - rootClass = globals().get(tag) - return tag, rootClass - - -def parse(inFileName): - doc = parsexml_(inFileName) - 
rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('\n') -## rootObj.export(sys.stdout, 0, name_=rootTag, -## namespacedef_='xmlns:ortho="http://orthoXML.org/2011/"') - return rootObj - - -def parseString(inString): - from StringIO import StringIO - doc = parsexml_(StringIO(inString)) - rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('\n') -## rootObj.export(sys.stdout, 0, name_="orthoXML", -## namespacedef_='xmlns:ortho="http://orthoXML.org/2011/"') - return rootObj - - -def parseLiteral(inFileName): - doc = parsexml_(inFileName) - rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('#from orthoxml import *\n\n') -## sys.stdout.write('import orthoxml as model_\n\n') -## sys.stdout.write('rootObj = model_.rootTag(\n') -## rootObj.exportLiteral(sys.stdout, 0, name_=rootTag) -## sys.stdout.write(')\n') - return rootObj - - -def main(): - args = sys.argv[1:] - if len(args) == 1: - parse(args[0]) - else: - usage() - - -if __name__ == '__main__': - #import pdb; pdb.set_trace() - main() - - -__all__ = [ - "database", - "gene", - "geneRef", - "genes", - "group", - "groups", - "notes", - "orthoXML", - "property", - "score", - "scoreDef", - "scores", - "species" - ] diff --git a/src/HogProf/profiler.py b/src/HogProf/profiler.py index 8b396f7..ce1065f 100755 --- a/src/HogProf/profiler.py +++ b/src/HogProf/profiler.py @@ -1,426 +1,429 @@ -from pyoma.browser import db +import functools +import gc +import logging +import multiprocessing as mp import pickle -import pandas as pd -import h5py import random -from tables import * -import numpy as np -import random -import ete3 -#from validation import validation_semantic_similarity -from HogProf.utils import hashutils , pyhamutils , files_utils +import time from time import time -import multiprocessing as mp -import functools + +import ete3 +import h5py import numpy as np -import time -import gc -import logging +import pandas as pd from pyoma.browser import db +from tables import * + +from HogProf.utils import hashutils, pyhamutils, files_utils + np.random.seed(0) random.seed(0) -class Profiler: - """ - A profiler object allows the user to query the LSH with HOGs and get a list of result HOGs back - - """ - def __init__(self,lshforestpath = None, hashes_h5=None, mat_path= None, oma = False , nsamples = 256 , mastertree = None ): - """ - The Profiler class initializes a profiler object for querying the LSH with HOGs and returning a list of result HOGs. - - Attributes: - lshobj (object): LSH object for querying. - hashes_h5 (h5py.File): H5 file containing HOGs. - nsamples (int): Number of samples to use. - tree (ete3.Tree): Master tree used for generating taxa index. - tree_string (str): String representation of the master tree. - taxaIndex (dict): Dictionary mapping taxa names to their indices in the master tree. 
- ReverseTaxaIndex (dict): Dictionary mapping indices in the master tree to their corresponding taxa names. - db_obj (db.Database): OMA database object. - treeweights (dict): Dictionary containing the tree weight for each taxon. - READ_ORTHO (callable): Function for reading orthoxml files from OMA. - HAM_PIPELINE (callable): Function for generating the Annotated tree from a row. - HASH_PIPELINE (callable): Function for generating the hash from a row. - - Parameters: - lshforestpath (str, optional): Path to the pickled LSH forest object. - hashes_h5 (str, optional): Path to the H5 file containing HOGs. - mat_path (str, optional): Path to the matrix file containing HOGs. - oma (str, optional): Path to the OMA database. - tar (str, optional): Path to the tar archive. - nsamples (int, optional): Number of samples to use. Defaults to 256. - mastertree (str, optional): Path to the master tree file. - """ - - print('loading lsh') - with open(lshforestpath, 'rb') as lshpickle: - self.lshobj = pickle.loads(lshpickle.read()) - print('indexing lsh') - self.lshobj.index() - - self.hashes_h5 = h5py.File(hashes_h5, mode='r') - print('h5' , self.hashes_h5 , self.hashes_h5.keys()) - self.nsamples = nsamples - if mastertree.split('.')[-1] == 'pkl': - with open( mastertree , 'rb') as pklin: - self.tree = pickle.loads(pklin.read()) - self.tree_string = self.tree.write(format=1) - elif mastertree.split('.')[-1] == 'nwk': - self.tree = ete3.Tree(mastertree,format=1) - self.tree_string = self.tree.write(format=1) - - else: - raise Exception( 'please provide a pickled ete3 tree or a newick file' ) - self.taxaIndex, self.ReverseTaxaIndex = files_utils.generate_taxa_index(self.tree) - - if oma: - h5_oma = open_file(oma, mode="r") - self.db_obj = db.Database(h5_oma) - self.treeweights = hashutils.generate_treeweights(self.tree , self.taxaIndex , None, None ) - self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma , db_obj=self.db_obj) - self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string ) - self.HASH_PIPELINE = functools.partial(hashutils.row2hash , taxaIndex=self.taxaIndex , treeweights=self.treeweights , wmg=None ) - - print('DONE') - - def hogid2fam(self, hog_entry): - if type(hog_entry )== int: - return hog_entry - else: - hog_entry = self.db_obj.entry_by_entry_nr(self.db_obj.id_resolver.resolve(hog_entry)) - famnr = int(self.db_obj.hog_family( entry=hog_entry ) ) - return famnr - - def return_profile_OTF(self, fam): - """ - Returns profiles as binary vectors for use with optimisation pipelines - """ - if type(fam) is str: - fam = self.hogid2fam(fam) - ortho_fam = self.READ_ORTHO(fam) - if ortho_fam: - tp = self.HAM_PIPELINE([fam, ortho_fam]) - - losses = [ self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex ] - dupl = [ self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex ] - presence = [ self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex ] - - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - return {fam:{ 'mat':hog_matrix_raw, 'tree':tp} } - else: - return{ fam: { 'mat':None , 'tree':None }} - - - def return_profile_complements(self, fam): - """ - Returns profiles for 
each loss to search for complementary hogs - """ - if type(fam) is str: - fam = self.hogid2fam(fam) - ortho_fam = self.READ_ORTHO(fam) - tp = self.HAM_PIPELINE([fam, ortho_fam]) - - losses = set([ n.name for n in tp.traverse() if n.lost and n.name in self.taxaIndex ]) - #these are the roots of the fams we are looking for - #we just assume no duplications or losses from this point - - ancestral_nodes = ([ n for n in profiler.tree.traverse() if n.name in losses]) - losses=[] - dupl=[] - complements={ n.name+'_loss' : [] } - - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - - return {fam:{ 'mat':hog_matrix_raw, 'hash':tp} } - - def worker( self,i, inq, retq ): - """ - this worker function is for parallelization of generation of binary vector for use with optimisation pipelines - - """ - print('worker start'+str(i)) - while True: - input = inq.get() - if input is None: - break - else: - fam,ortho_fam = input - tp = self.HAM_PIPELINE([fam, ortho_fam]) - losses = [ self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex ] - dupl = [ self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex ] - presence = [ self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex ] - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - retq.put({fam:{ 'mat':hog_matrix_raw, 'tree':tp} }) - - - def retmat_mp(self, traindf , nworkers = 25, chunksize=50 ): - """ - function used to create training matrix with pairs of hogs. 
calculate_x will return the intersetcion of - two binary vectors generated by pyham - """ - #fams = [ hashutils.hogid2fam(fam) for fam in fams ] - def calculate_x(row): - mat_x1 = row.mat_x - mat_x2 = row.mat_y - ret1 = np.zeros(mat_x1.shape) - ret2 = np.zeros(mat_x2.shape) - #diff = mat_x1 - mat_x2 - matsum = mat_x1 + mat_x2 - #ret1[np.where(diff != 0 ) ] = -1 - ret2[ np.where(matsum == 2 ) ] = 1 - return list(ret2) - retq= mp.Queue(-1) - inq= mp.Queue(-1) - processes = {} - mp.log_to_stderr() - logger = mp.get_logger() - logger.setLevel(logging.INFO) - - for i in range(nworkers): - processes[i] = {'time':time.time() , 'process': mp.Process( target = self.worker , args = (i,inq, retq ) ) } - #processes[i]['process'].daemon = True - processes[i]['process'].start() - - for batch in range(0, len(traindf) , chunksize ): - - slicedf = traindf.iloc[batch:batch+chunksize, :] - fams = list(set(list(slicedf.HogFamA.unique()) + list(slicedf.HogFamB.unique() ) ) ) - total= {} - - for fam in fams: - orthxml = self.READ_ORTHO(fam) - if orthxml is not None: - inq.put((fam,orthxml)) - done = [] - count = 0 - while len(fams)-1 > count: - try: - data =retq.get(False) - count+=1 - total.update(data) - except : - pass - time.sleep(.01) - - gc.collect() - retdf= pd.DataFrame.from_dict( total , orient= 'index') - slicedf = slicedf.merge( retdf , left_on = 'HogFamA' , right_index = True , how= 'left') - slicedf = slicedf.merge( retdf , left_on = 'HogFamB' , right_index = True , how= 'left') - slicedf = slicedf.dropna(subset=['mat_y', 'mat_x'] , how = 'any') - slicedf['xtrain'] = slicedf.apply( calculate_x , axis = 1) - X_train = np.vstack( slicedf['xtrain']) - y_train = slicedf.truth - print(slicedf) - - yield (X_train, y_train) - for i in processes: - inq.put(None) - for i in processes: - processes[i]['process'].terminate() - - def retmat_mp_profiles(self, fams , nworkers = 25, chunksize=50 , verbose = False ): - """ - function used to create dataframe containing binary profiles - and trees of fams - """ - - fams = [ f for f in fams if f] - retq= mp.Queue(-1) - inq= mp.Queue(-1) - processes = {} - mp.log_to_stderr() - logger = mp.get_logger() - logger.setLevel(logging.INFO) - total = {} - - for i in range(nworkers): - processes[i] = {'time':time.time() , 'process': mp.Process( target = self.worker , args = (i,inq, retq ) ) } - #processes[i]['process'].daemon = True - processes[i]['process'].start() - for fam in fams: - if verbose == True: - print(fam) - try: - orthxml = self.READ_ORTHO(fam) - except: - orthxml = None - if orthxml is not None: - inq.put((fam,orthxml)) - done = [] - count = 0 - - while len(fams)-1 > count : - try: - data =retq.get(False ) - count+=1 - total.update(data) - if count % 100 == 0 : - print(count) - except : - pass - time.sleep(.01) - - for i in range(nworkers): - processes[i]['process'].terminate() - retdf= pd.DataFrame.from_dict( total , orient= 'index') - return retdf - - def hog_query(self, hog_id=None, fam_id=None , k = 100 ): - """ - Given a hog_id or a fam_id as a query, returns a dictionary containing the results of the LSH. 
- :param hog_id: query hog id - :param fam_id: query fam id - :return: list containing the results of the LSH for the given query - """ - - if hog_id is not None: - fam_id = self.hogid2fam(hog_id) - query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5 , nsamples= self.nsamples ) - #print(query_hash.hashvalues) - results = self.lshobj.query(query_hash, k) - - - return results - - def hog_query_sorted(self, hog_id=None, fam_id=None , k = 100 ): - """ - Given a hog_id or a fam_id as a query, returns a dictionary containing the results of the LSH. - :param hog_id: query hog id - :param fam_id: query fam id - :return: list containing the results of the LSH for the given query - """ - - if hog_id is not None: - fam_id = self.hogid2fam(hog_id) - query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5 , nsamples= self.nsamples ) - results = self.lshobj.query(query_hash, k) - hogdict = self.pull_hashes(results) - - hogdict = { hog: hogdict[hog].jaccard(query_hash) for hog in hogdict } - sortedhogs = [(k, v) for k, v in hogdict.items()] - sortedhogs = sorted(student_tuples, key=lambda x: x[1]) - sortedhogs = [ h[0] for h in sortehogs.reverse() ] - return hogdict - - def pull_hashes(self , hoglist): - - """ - Given a list of hog_ids , returns a dictionary containing their hashes. - This uses the hdf5 file to get the hashvalues - :param hog_id: query hog id - :param fam_id: query fam id - :return: a dict containing the hash values of the hogs in hoglist - """ - - return { entry: hashutils.fam2hash_hdf5( self.hogid2fam(entry), self.hashes_h5 , nsamples= self.nsamples) for entry in hoglist} - - def pull_matrows(self,fams): - """ - given a list of fams return the submatrix containing their profiles - - :return:fams sorted, sparse mat - """ - return self.profile_matrix[np.asarray(fams),:] - - - @staticmethod - def sort_hashes(query_hash,hashes): - """ - Given a dict of hogs:hashes, returns a sorted array of hogs and jaccard distances relative to query hog. - :param query hash: weighted minhash of the query - :param hashes: a dict of hogs:hashes - :return: sortedhogs, jaccard - """ - #sort the hashes by their jaccard relative to query hash - jaccard=[ query_hash.jaccard(hashes[hog]) for hog in hashes] - index = np.argsort(jaccard) - sortedhogs = np.asarry(list(hashes.keys()))[index] - jaccard= jaccard[index] - return sortedhogs, jaccard - - @staticmethod - def allvall_hashes(hashes): - """ - Given a dict of hogs:hashes, returns generate an all v all jaccard distance matrix. - :param hashes: a dict of hogs:hashes - :return: hashmat - """ - #generate an all v all jaccard distance matrix - hashmat = np.zeros((len(hashes),len(hashes))) - for i , hog1 in enumerate(hashes): - for j, hog2 in enumerate(hashes): - if i < j : - hashmat[i,j]= hashes[hog1].jaccard(hashes[hog2]) - hashmat = hashmat+hashmat.T - np.fill_diagonal(hashmat, 1) - return hashmat - - def hog_v_hog(self, hogs): - """ - give two hogs returns jaccard distance. - :param hog1 , hog2: str hog id - :return: jaccard score - """ - hog1,hog2 = hogs - #generate an all v all jaccard distance matrix - hashes = self.pull_hashes([hog1,hog2]) - hashes = list(hashes.values()) - return hashes[0].jaccard(hashes[1]) - - def allvall_nx(G,hashes,thresh =None): - - """ - Given a dict of hogs:hashes, returns generate an all v all jaccard distance matrix. 
- :param hashes: a dict of hogs:hashes - :return: hashmat - """ - - #generate an all v all jaccard distance matrix - - hashmat = [[ hashes[hog1].jaccard(hashes[hog2]) if j>i else 0 for j,hog2 in enumerate(hashes[0:i] ) ] for i,hog1 in enumerate(hashes) ] - hashmat = np.asarray(hashmat) - hashmat+= hashmat.T - np.fill_diagonal(hashmat, 1) - - #hashmat = np.zeros((len(hashes),len(hashes))) - - #for i , hog1 in enumerate(hashes): - # for j, hog2 in enumerate(hashes): - # hashmat[i,j]= hashes[hog1].jaccard(hashes[hog2]) - return hashmat - - def iternetwork(seedHOG): - pass - - def rank_hashes(query_hash,hashes): - jaccard = [] - sorted = [] - scores = {} - hogsRanked = np.asarray(list(hashes.keys())) - for i, hog in enumerate(hashes): - score = query_hash.jaccard(hashes[hog]) - jaccard.append( score) - scores[hog] = score - hogsRanked = list( hogsRanked[ np.argsort(jaccard) ] ) - jaccard = np.sort(jaccard) - return hogsRanked, jaccard + +class Profiler: + """ + A profiler object allows the user to query the LSH with HOGs and get a list of result HOGs back + + """ + + def __init__(self, lshforestpath=None, hashes_h5=None, mat_path=None, oma=False, nsamples=256, mastertree=None): + """ + The Profiler class initializes a profiler object for querying the LSH with HOGs and returning a list of result HOGs. + + Attributes: + lshobj (object): LSH object for querying. + hashes_h5 (h5py.File): H5 file containing HOGs. + nsamples (int): Number of samples to use. + tree (ete3.Tree): Master tree used for generating taxa index. + tree_string (str): String representation of the master tree. + taxaIndex (dict): Dictionary mapping taxa names to their indices in the master tree. + ReverseTaxaIndex (dict): Dictionary mapping indices in the master tree to their corresponding taxa names. + db_obj (db.Database): OMA database object. + treeweights (dict): Dictionary containing the tree weight for each taxon. + READ_ORTHO (callable): Function for reading orthoxml files from OMA. + HAM_PIPELINE (callable): Function for generating the Annotated tree from a row. + HASH_PIPELINE (callable): Function for generating the hash from a row. + + Parameters: + lshforestpath (str, optional): Path to the pickled LSH forest object. + hashes_h5 (str, optional): Path to the H5 file containing HOGs. + mat_path (str, optional): Path to the matrix file containing HOGs. + oma (str, optional): Path to the OMA database. + tar (str, optional): Path to the tar archive. + nsamples (int, optional): Number of samples to use. Defaults to 256. + mastertree (str, optional): Path to the master tree file. 
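+
+        Example (an illustrative sketch only; the file names below are
+        placeholders for a prebuilt LSH forest, hash store and master tree,
+        not files shipped with HogProf):
+
+            >>> p = Profiler(lshforestpath='newlsh.pkl',
+            ...              hashes_h5='hashes.h5',
+            ...              mastertree='master_tree.pkl',
+            ...              nsamples=256)
+            >>> p.hog_query(fam_id=616, k=10)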
+ """ + + print('loading lsh') + with open(lshforestpath, 'rb') as lshpickle: + self.lshobj = pickle.loads(lshpickle.read()) + print('indexing lsh') + self.lshobj.index() + + self.hashes_h5 = h5py.File(hashes_h5, mode='r') + print('h5', self.hashes_h5, self.hashes_h5.keys()) + self.nsamples = nsamples + if mastertree.split('.')[-1] == 'pkl': + with open(mastertree, 'rb') as pklin: + self.tree = pickle.loads(pklin.read()) + self.tree_string = self.tree.write(format=1) + elif mastertree.split('.')[-1] == 'nwk': + self.tree = ete3.Tree(mastertree, format=1) + self.tree_string = self.tree.write(format=1) + + else: + raise Exception('please provide a pickled ete3 tree or a newick file') + self.taxaIndex, self.ReverseTaxaIndex = files_utils.generate_taxa_index(self.tree) + + if oma: + h5_oma = open_file(oma, mode="r") + self.db_obj = db.Database(h5_oma) + self.treeweights = hashutils.generate_treeweights(self.tree, self.taxaIndex, None, None) + self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma, db_obj=self.db_obj) + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string) + self.HASH_PIPELINE = functools.partial(hashutils.row2hash, taxaIndex=self.taxaIndex, + treeweights=self.treeweights, wmg=None) + + print('DONE') + + def hogid2fam(self, hog_entry): + if type(hog_entry) == int: + return hog_entry + else: + hog_entry = self.db_obj.entry_by_entry_nr(self.db_obj.id_resolver.resolve(hog_entry)) + famnr = int(self.db_obj.hog_family(entry=hog_entry)) + return famnr + + def return_profile_OTF(self, fam): + """ + Returns profiles as binary vectors for use with optimisation pipelines + """ + if type(fam) is str: + fam = self.hogid2fam(fam) + ortho_fam = self.READ_ORTHO(fam) + if ortho_fam: + tp = self.HAM_PIPELINE([fam, ortho_fam]) + + losses = [self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex] + dupl = [self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex] + presence = [self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex] + + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + return {fam: {'mat': hog_matrix_raw, 'tree': tp}} + else: + return {fam: {'mat': None, 'tree': None}} + + def return_profile_complements(self, fam): + """ + Returns profiles for each loss to search for complementary hogs + """ + if type(fam) is str: + fam = self.hogid2fam(fam) + ortho_fam = self.READ_ORTHO(fam) + tp = self.HAM_PIPELINE([fam, ortho_fam]) + + losses = set([n.name for n in tp.traverse() if n.lost and n.name in self.taxaIndex]) + # these are the roots of the fams we are looking for + # we just assume no duplications or losses from this point + + ancestral_nodes = ([n for n in profiler.tree.traverse() if n.name in losses]) + losses = [] + dupl = [] + complements = {n.name + '_loss': []} + + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + + return {fam: {'mat': hog_matrix_raw, 
'hash': tp}} + + def worker(self, i, inq, retq): + """ + this worker function is for parallelization of generation of binary vector for use with optimisation pipelines + + """ + print('worker start' + str(i)) + while True: + input = inq.get() + if input is None: + break + else: + fam, ortho_fam = input + tp = self.HAM_PIPELINE([fam, ortho_fam]) + losses = [self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex] + dupl = [self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex] + presence = [self.taxaIndex[n.name] for n in tp.traverse() if + n.nbr_genes > 0 and n.name in self.taxaIndex] + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + retq.put({fam: {'mat': hog_matrix_raw, 'tree': tp}}) + + def retmat_mp(self, traindf, nworkers=25, chunksize=50): + """ + function used to create training matrix with pairs of hogs. calculate_x will return the intersetcion of + two binary vectors generated by pyham + """ + + # fams = [ hashutils.hogid2fam(fam) for fam in fams ] + def calculate_x(row): + mat_x1 = row.mat_x + mat_x2 = row.mat_y + ret1 = np.zeros(mat_x1.shape) + ret2 = np.zeros(mat_x2.shape) + # diff = mat_x1 - mat_x2 + matsum = mat_x1 + mat_x2 + # ret1[np.where(diff != 0 ) ] = -1 + ret2[np.where(matsum == 2)] = 1 + return list(ret2) + + retq = mp.Queue(-1) + inq = mp.Queue(-1) + processes = {} + mp.log_to_stderr() + logger = mp.get_logger() + logger.setLevel(logging.INFO) + + for i in range(nworkers): + processes[i] = {'time': time.time(), 'process': mp.Process(target=self.worker, args=(i, inq, retq))} + # processes[i]['process'].daemon = True + processes[i]['process'].start() + + for batch in range(0, len(traindf), chunksize): + + slicedf = traindf.iloc[batch:batch + chunksize, :] + fams = list(set(list(slicedf.HogFamA.unique()) + list(slicedf.HogFamB.unique()))) + total = {} + + for fam in fams: + orthxml = self.READ_ORTHO(fam) + if orthxml is not None: + inq.put((fam, orthxml)) + done = [] + count = 0 + while len(fams) - 1 > count: + try: + data = retq.get(False) + count += 1 + total.update(data) + except: + pass + time.sleep(.01) + + gc.collect() + retdf = pd.DataFrame.from_dict(total, orient='index') + slicedf = slicedf.merge(retdf, left_on='HogFamA', right_index=True, how='left') + slicedf = slicedf.merge(retdf, left_on='HogFamB', right_index=True, how='left') + slicedf = slicedf.dropna(subset=['mat_y', 'mat_x'], how='any') + slicedf['xtrain'] = slicedf.apply(calculate_x, axis=1) + X_train = np.vstack(slicedf['xtrain']) + y_train = slicedf.truth + print(slicedf) + + yield (X_train, y_train) + for i in processes: + inq.put(None) + for i in processes: + processes[i]['process'].terminate() + + def retmat_mp_profiles(self, fams, nworkers=25, chunksize=50, verbose=False): + """ + function used to create dataframe containing binary profiles + and trees of fams + """ + + fams = [f for f in fams if f] + retq = mp.Queue(-1) + inq = mp.Queue(-1) + processes = {} + mp.log_to_stderr() + logger = mp.get_logger() + logger.setLevel(logging.INFO) + total = {} + + for i in range(nworkers): + processes[i] = {'time': time.time(), 'process': mp.Process(target=self.worker, args=(i, inq, retq))} + # processes[i]['process'].daemon = True + 
processes[i]['process'].start()
+        for fam in fams:
+            if verbose:
+                print(fam)
+            try:
+                orthxml = self.READ_ORTHO(fam)
+            except:
+                orthxml = None
+            if orthxml is not None:
+                inq.put((fam, orthxml))
+        done = []
+        count = 0
+
+        while len(fams) - 1 > count:
+            try:
+                data = retq.get(False)
+                count += 1
+                total.update(data)
+                if count % 100 == 0:
+                    print(count)
+            except:
+                pass
+            time.sleep(.01)
+
+        for i in range(nworkers):
+            processes[i]['process'].terminate()
+        retdf = pd.DataFrame.from_dict(total, orient='index')
+        return retdf
+
+    def hog_query(self, hog_id=None, fam_id=None, k=100):
+        """
+        Given a hog_id or a fam_id as a query, returns the top k results from the LSH.
+        :param hog_id: query hog id
+        :param fam_id: query fam id
+        :return: list containing the results of the LSH for the given query
+        """
+
+        if hog_id is not None:
+            fam_id = self.hogid2fam(hog_id)
+        query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5, nsamples=self.nsamples)
+        # print(query_hash.hashvalues)
+        results = self.lshobj.query(query_hash, k)
+
+        return results
+
+    def hog_query_sorted(self, hog_id=None, fam_id=None, k=100):
+        """
+        Given a hog_id or a fam_id as a query, returns the LSH results sorted by their jaccard similarity to the query.
+        :param hog_id: query hog id
+        :param fam_id: query fam id
+        :return: list of result ids, sorted by decreasing jaccard similarity to the query
+        """
+
+        if hog_id is not None:
+            fam_id = self.hogid2fam(hog_id)
+        query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5, nsamples=self.nsamples)
+        results = self.lshobj.query(query_hash, k)
+        hogdict = self.pull_hashes(results)
+
+        hogdict = {hog: hogdict[hog].jaccard(query_hash) for hog in hogdict}
+        sortedhogs = sorted(hogdict.items(), key=lambda x: x[1], reverse=True)
+        sortedhogs = [h[0] for h in sortedhogs]
+        return sortedhogs
+
+    def pull_hashes(self, hoglist):
+
+        """
+        Given a list of hog ids, returns a dictionary containing their hashes.
+        This uses the hdf5 file to get the hashvalues
+        :param hoglist: list of query hog ids
+        :return: a dict containing the hash values of the hogs in hoglist
+        """
+
+        return {entry: hashutils.fam2hash_hdf5(self.hogid2fam(entry), self.hashes_h5, nsamples=self.nsamples) for entry
+                in hoglist}
+
+    def pull_matrows(self, fams):
+        """
+        given a list of fams return the submatrix containing their profiles
+
+        :return: fams sorted, sparse mat
+        """
+        return self.profile_matrix[np.asarray(fams), :]
+
+    @staticmethod
+    def sort_hashes(query_hash, hashes):
+        """
+        Given a dict of hogs:hashes, returns a sorted array of hogs and jaccard distances relative to query hog.
+        :param query_hash: weighted minhash of the query
+        :param hashes: a dict of hogs:hashes
+        :return: sortedhogs, jaccard
+        """
+        # sort the hashes by their jaccard relative to query hash
+        jaccard = [query_hash.jaccard(hashes[hog]) for hog in hashes]
+        index = np.argsort(jaccard)
+        sortedhogs = np.asarray(list(hashes.keys()))[index]
+        jaccard = np.asarray(jaccard)[index]
+        return sortedhogs, jaccard
+
+    @staticmethod
+    def allvall_hashes(hashes):
+        """
+        Given a dict of hogs:hashes, generate an all-vs-all jaccard distance matrix.
+ :param hashes: a dict of hogs:hashes + :return: hashmat + """ + # generate an all v all jaccard distance matrix + hashmat = np.zeros((len(hashes), len(hashes))) + for i, hog1 in enumerate(hashes): + for j, hog2 in enumerate(hashes): + if i < j: + hashmat[i, j] = hashes[hog1].jaccard(hashes[hog2]) + hashmat = hashmat + hashmat.T + np.fill_diagonal(hashmat, 1) + return hashmat + + def hog_v_hog(self, hogs): + """ + give two hogs returns jaccard distance. + :param hog1 , hog2: str hog id + :return: jaccard score + """ + hog1, hog2 = hogs + # generate an all v all jaccard distance matrix + hashes = self.pull_hashes([hog1, hog2]) + hashes = list(hashes.values()) + return hashes[0].jaccard(hashes[1]) + + def allvall_nx(G, hashes, thresh=None): + + """ + Given a dict of hogs:hashes, returns generate an all v all jaccard distance matrix. + :param hashes: a dict of hogs:hashes + :return: hashmat + """ + + # generate an all v all jaccard distance matrix + + hashmat = [[hashes[hog1].jaccard(hashes[hog2]) if j > i else 0 for j, hog2 in enumerate(hashes[0:i])] for + i, hog1 in enumerate(hashes)] + hashmat = np.asarray(hashmat) + hashmat += hashmat.T + np.fill_diagonal(hashmat, 1) + + # hashmat = np.zeros((len(hashes),len(hashes))) + + # for i , hog1 in enumerate(hashes): + # for j, hog2 in enumerate(hashes): + # hashmat[i,j]= hashes[hog1].jaccard(hashes[hog2]) + return hashmat + + def iternetwork(seedHOG): + pass + + def rank_hashes(query_hash, hashes): + jaccard = [] + sorted = [] + scores = {} + hogsRanked = np.asarray(list(hashes.keys())) + for i, hog in enumerate(hashes): + score = query_hash.jaccard(hashes[hog]) + jaccard.append(score) + scores[hog] = score + hogsRanked = list(hogsRanked[np.argsort(jaccard)]) + jaccard = np.sort(jaccard) + return hogsRanked, jaccard diff --git a/src/HogProf/utils/__pycache__/__init__.cpython-310.pyc b/src/HogProf/utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8165b0873c3ce3edb3871fe5310c1dd0532d8143..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 210 zcmd1j<>g`kg6T?=llg)4V-N=!FakLaKwQiLBvKfn7*ZI688n%0v6hx(<`nyBGTvgz zOUzAO$xy@$Q~)M^rR!(p=cei>CnlGa7H4Os=!g1w`uO^V=ob{_XQd{W6zhBBrw0_} zr^OegrU4b@7wH!lCBp>u^YTkllk)Ski}m&NKnSW>A8ez3e0*kJW=VX!UP0w84x8Nk Rl+v73JCG}jnScZb0|4~kIAs6; diff --git a/src/HogProf/utils/__pycache__/__init__.cpython-38.pyc b/src/HogProf/utils/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index 2464570e5fa9ace6b18da06a085da839ee86f749..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 192 zcmWIL<>g`kg6T?=llg)4V-N=!FakLaKwQiLBvKfn7*ZI688n%0v6hx(<`nyBGTvgz zOUzAO$xy@$Q~)M^h3l8+7iH_aIXdgRIr-|lI63Plr=%7aXXYB|r=+GOmgbb`r{v~m z>K7E{XQd{W6zhBBrw0_}r^OcmWs^(ti}Z_&lJ&th=*P!r=4F<|$LkeT-r}&y%}*)K NNwovH>N5~C0051B~}Aul$33L)up!P`@G}g*UW<2P6pJsh*CI(6#Q z>2vwUQN3Pacs~1Zm;d1uV}GW>{9|D7Hd_26bewT5V4eJ}b##pNz!;hxQ_JV9j-`0) zj;-FMPKj5z!Of4fj>D_m;x?!;B5RBWCw8K9o0mRfsd)=>NK$G|ozzwm%BhBRN3C1a$|1{nmg&S&V)C+*P3M&Ib9^>& z(8_7)7JK;mq>@y(YP^!v)5?K{o7bKglWNjPs?&wEdPt|$5;Lull(dpMX^q!c*fqu* z>ckpfSWzdY<+O5*ZGQmGnxw=*v*(&FE~%ViZivgmiQt+4f{2c*HWD?on`#B1O3my)}i`d^SjeVYW4N|3^M*P zFQb@mt?d7;;^O&%2!-?#5xbt7c)L*;4gH=A_Yv+m_QMS~*%WTrTlb6AJ=Y7l+PXUO zq&IY-%Ns)|YxT@6a#)*V}tJpH*%O}(#W+wkRd=dAEv3PV4Cfa?pcnRGGOW8wvq9u$X=C}Gb z?EI8LlmQJ|dZu5xoLR7?&!r7^)%|n5y%Whnd+nW@?X?^4w{P9J+3s->$NunYn~T0T z4w5z>Mt*xFBQjsy{$aFnPe%Q&#A+{zWIL9{2*?VW#X=)FOBxH%C*3clU_6& zU0=q*pOfZGZ2$a*NV;UuOV@&^=LPX~5C!8)S2eJSuKM{RI(1bGNkQ9U`9?hG<*=gxk!J3l)b1ew0G 
z-dfH~3V_VHbMwY+n8bs7iX^k87)FmoW{nU%NoE8h%*yvR_kxH5G^;6JiAGTnZR}-b zkMnMC(+_xNlE-8fS`GvvHJ0RE9b5E8(%Y0J3=mUdH-ThN#sXhif(?|UkCs(PR16tH zW>umiIokBXIJ0)7pNPziM$p6_A>adO$H&7_+&Yz+@D(pBZ3yHg5qIopoN!;_BxIz_ zz#=%HcgIYib5@!O?bK&P*AM&AmE*&!a)C6j$HJ4n%`TMSI1zT~MUrP)m{m#QEWLDW zSQ&?QY}gg=6V8-5MQoKJGky=9<2ah5RkRxZ8rmYL3mDflQ#bKm!W@m3(JEMVH2v`f z(A1hnO*{X*sbkM`$7ue`G32))@K_nAh$DO6M2mlgj>>q-4z-Dq8dF^rxAQ2$x;&Q{ zskw#njkkWF4KS0M69a{%mTK}Mt)&{uAE>Eyi1M;sM&Y&4?8HtP$vV(bDw{vkCnZQE zemgB)V`+(N%M4`?GbpN;LD^JlTKfLqRhpeEDWU6!TTHkda zdBGSWl}<_=RY3AECFw(<{Pkslihs4(Jug!UkyI2~rOfO_NWu;mVnmcwNrnO=j{uvC z01=s$MK~U+n%i>ZWh`f9sDM;Xp-}>fXdYxG6g9fFe1kYDq$Hdf_w6hm(WnhduqZJRu3?)+Kcn!op z2ht;lNq}Ct0sN;Co}6Jo8?_Nmm)gq=D2V|ZSyO`?3W&`~*Z~k8H-YX8HHXJE)rxdq27(+*(F|9!?BhE&+`B5)$yIIRNHl~!|i!$JgQDcTeaKOgnW{pk0{M(%KdfASC}zQT8JZltnZl&xa0hW^JpI*11Et za{J?a?R#wd7fQ<}w6t<9fp5#W`|tD7w5me#6SDBDtrK6xS_2jYa)t#9ozBd3YCw}e z@iOopbipk?%P9+$l9%6P22e8Qid}RQ#Yw9J^Ea9Vs-D&k8Mg-dY*Z(W=B2!qOZiJm zzdkD7FHx&hJgoV<3N=#UDMEeET?cUFoKQf424;bDDGzJ~CRc$6sVu0xP0(&GwtG>S zcz#F{D66}D84Zuli9$sevdQU>$)#9wk6@u17R)hfi#yZi*rvSfbbTIosr&(WB8LHA zQ5?Idrf$@y3t}BBUgE}^(Kz7lx^QRe9OW&<>3S3w2v`+gmy7lB#s*y|PR9Lye)K%X zB?}bkMrpj`C!0!%+)VDQE|O(M4$2>VQSec~HxuQ&e$?N<+*zODh#P` z_1YIbSiVhyUjfPn#evVBYSUY9Nbwr@vXDc?2Bbq-1WP0VD@n!t`km;=@)~; Q2;L9`0thlHmm25(10t_OTL1t6 diff --git a/src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc b/src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc deleted file mode 100644 index 2fccd017e33a57705372fceef796c4425121ec8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4400 zcmcIoOK%*<5$<{KlcXq$A_c)7Bo5+D9Ia(LiNVl9M9EAP$O>aA4=gNfhTFa5aA#*$ zJ+tBiy_e9Z078`ff~9W){0H(Ee99?@x#px}4!HyZlCOGp#U-T#fq~3oW~#fZtEy}2 z@sYo**DDOy3+GRrKVM+%Z#0;_EDU~(JN*?NoN*&!t>SI8OpMLQ>f0^bDCV4&qj}tx ztKVMB;}vdk`*WjJ;#KZ&7gU+oc!`%mRk(4VHR{hHo84;|x%+XPNpaL6t1M4Nd?u;uZtl(K8jb3K+vX)JHyh2aNMkU;3 z<=tivw?>t$x>w_stgb4@22NgkVvVZVTvi>=tLh19t7W#TkrY)?B~|0~RkqIfoYq+5 z^Q&5;Dyzymd-Me?TOb=AzCEg|`rZXq>bdd-s736>c$Lu!>Ex;Dxmz2quE;D}V*!;u7XkN{am((2j>(}i2>=*1) z_8X>omcerwJj4zO30U`Wh8pb<*wk!O&vmH)X5B(tKdh6OCkU`&1`oWry-Gf6=KHv6}?LE=SW?9nU zv3LNHYyL)(M1D7s{{F6jWa)#9lg!T&pO_$~AEcR(+JIQ>hn=3^4I@!VlOh|+cukwv zPBRJn*05#5S4`+d%*MXR#QRfY^nQszW=H1tu0Jz!I}Z9{6XAcclSEN4NJTpxZl?p$ z2}Rl=#_45(uyNnVoi5?A!xVd18yOj+9QdR8$iB~>F^IBYLBq_=YwzX`d>L}-f?YrU z*=p`5ve(?ad8@g(@k#Uc#;s@m>cbjTJJItaX{mj;`*AtWB=GwjL#&AWCn5 zn6kcdNDJHaAs0dDLspRB#vUt-Vbp|ilg8l zSj$V37(2|ZgMLKL3VF-JckL%!w5nTy6mj+-IJm=KG4HkC0F{NUP7G`qEk|!Htg4Yg zFF(Mkeczh3uT{Pq#v<4eA0ND;@PGD){7WRc~r6#e9}$ejTaEz7M)#CdsZ z_b^H*m-Ctqu4Irz$ zG8G642{urYzHwe5Q7L2ynOBKUEfZ7njIzH?V(#D0{MrZ}O zwtWE~smj{axn z$b&@Uca^u!l*f$~hE9hW^wHmfa&^x!kN%`2@qTvq7Uh{d_$-*QKI_z*TMm@EHUkC!o(o>qN*YcZC5KF@PawY*9D?qLngs3`oaqz&V?M9Kf2L z?iQer+dccKAr*jBsb>+z9fV$)b^%!acRe$}Q3J-G0URQbRTV(aTZMMzAl(0|oU^m5 zZ0r$yL;`%FsypC1yG~H?+!+95$0eO=w?{Qq+XL+Dxv z)p>~kNDNs|U+%ntotl@{wcf8(ZCpEH-07J(361Jx z!K|DQr*ghh^yfw;_#N7l&W6qKK!ZwZHq%5u^tXX4IVYen1q_okx>lsM#+qw@i_{8q zttR9*li8gl&Vn!|2~^4bu1xx;dXiYvQ`r>qXXH{Q`KPE+52ws2`iVcW=FFy|)wDyN zwyEU-m!g6JhEW##=%9Ymr2}FeD?#R`yU8%({&5SalJelb4BplO9y)avL2Xm8&!+;98&) zHy8k7@S;FdnUminUTcsHaxcW!CJ+1oQ;qpnbtWWrwQkLyi;LWweBIpXh7!<-N;gGr z?oRMg6QWvP5^-(JO;Wf{Q*I$!m-v=VgWRF!m7YKMTj0KjJKe&Am2Jy5mrcL~(?!`X znOAl3UPei;X;^X9TmgI_U@=E;vxGuFU4I`>~hGt zEYF#hL}s=vN;S3HI%ylAXbTh|f%ZXfMc;ZsQJ{ZBfI%OMv@d$^6Pw2UeP?!;q?B}) zbLQM{-}n8#3$xkTx`O8)*Z#HpJ69CtKdEr`(NXw1Uhz{TOkrxQB&w&Xl-FV{(LG(} z_1H*E&y;y1wi4U3kvEwYJ4wx}$$C4kdv%O+;+dr3HDp~ao=uuwbKE}Xwa{LV=RctN 
z?RddkU{~1;YdlfCMdW5#6S*Ze$68Mm?*eM(*#c^oky~U-$gQvoY#D7A*$TUe?u%70rdDHX+kw~~=20v*(Kf%a^-jv8?sgKU z`TcyzXn@8#NY$@&HCk`{82CDP@8A`;krXOZhYD}xO0Mo`Oj}V7)sdDOIa(-}>xFV; z94m!VXiR4YGy5vDKD9=Aq2n<2!Mk-A&qkPrJjg@0pSj_5E3TluxLIc>?B?#HDBpGi zw-a=C`#c+_%w<6yh%jH1Q*0hAt3Az zYrwNX(8q#)7c=FeG=q4s9h7zw=Gz$)ZIhcgUD@bnX)o&g@~TRk7Gx3UbOoic8}5s? z;m!4fo}UMgdr_Q+JY=4ZWjzZC3;Lzq&%yvFc&hMuj5%_46A{x)ppDu+CPeLdD z97&{nqJY*AJEncA{b+5Z71~|p!BwK~k$S9*jKX00s`9b&P~}&14JBh$>4P^uR;rd; z%HGT*zp5PRRgYr`@ZQXa5HbV5Lx^zdysn6^JwiSGzK6F^`8WPR>8T{dPn<%%3*P$# zD-mC9mMVX$mPV4XkQ38$3rUx9lZW`4&O?V3%Sa-eGeD{Qsp>U|?h~TR4txQ#l=WLI>gL@n8Qfl{scD^M<={$Z7{zSk?R#st;;b9Q z;`Z8U$F@-#*&s|yQw-uLFLkhWsRxX;P0tyx6MD`tjUI+vgr14IC@qaRO2g88lyN4i z9S}LIZ8($luF@u`-q4z=gI`0vs9w~x|GDy+;Vh_z&RsN~BlASeC7BH zYdbpARuv*DGgc&mg77ax)Quyv(2hX@NfetoPn?gHBP+KH5aw8t2W+7pIfYK-Jxg2$ zqJg|O&XSkv+O^Hn7J0z)kd-<_4|0`|2p%-HlHh*GaUE;hH~13v$S=?_ekq0N2Kq!{ zXQbxf65x}zqyFhBjoTWhbC)Joh!{#MR8m8*k;OVBFz1LIN1`I~*U(K8`J!_0>R8;i zv%c{2C<(J+eum!DG3{DuO872oM0R4Mkk8V3R)sjchJxo1oB_&0Z#D^&PRM=WLN7>4 z9mrqmgJBNFv3Mwkab6l&d(X4BZ}>PssiU^q?-wys#S)9ZV0SlY_YEYDdPS|{ZKz-P zUw#c0UAm3QL!3PY{-5HKN#LL0I!Wk8Mx=~P@Q6-4(pN?n@LFE3dP8gE9{=e>`%2(y{I2q#}NG7KC&_E z40A?}DqqZ81Sjq)oT2Qn8u1r;*N&mfvBpQ(Po3sYo%8+0d*{jq=WGUZDy%`hF+-i! z`|WyRAI*&C`dj21%sf^{vpChPJk?7$RdX~~G}$cn@Mk#DoE+a|&0T~41=gLVGXdUa zi#fSBYm(dh!QSP`?(nU~X-Cb%!V0ZPOQG{;`CQ)GnJ;w6Tnq2~q4uNs(R?v~w2&_z zEfpqPIEM5|xmr92nm;)!RfTqRf#%7V3s4L5EUhR->sZ6BD(r&Xb*nIY0KWF}$=^uL z>f(7nE3 zYTR^KHJ^Yv5Kg9a18Dqrw>Dft)=WfX^T_@$h`bG6y^mLX6^Ru2g^>zt4Pu`I+z;^s zw`fbs%DB9uR4~xi4-Bku^BK)hs@DMcdJIt3Mb5`~03#wu{D@VeMH*d-7Jn7F-@q$@ z;H%2Pm-sOCVOG8EXX*G(ei06wH0d-EV7%|rRd^;8*6VLPBa>IEEo{L8{Pm#@41C1B z{$KqiJ^%1lX|OB}Kk$sZVZfwzi~bvOd2$&mr!?Qj)>&#vt$)&ly&$f`yvW7cU^gzc zN1bP+pEggLrxwCm{y0A|?{00r^KyHIe;LiCmJCYsTf=Y|mbEY)Cem0dZA{b+g^+Lo z)kQd68U%YN#9=rnok;jW45d}pU>hY-3X`ie!(@=}mpUEF3I@PGR@Uggqb|@_Gf-0p_Bt@YT0d)<)RXcJy^a>j)5s=z zXunJ2>$1G8E~!^OyCmDWk0s7w_6#bfnEf*pfYLC`D@_iRHc6XH{Q5HR%Np5*#WbL@ z4#d0!46_IgAU{T71C8|~d`XKxXpJ1I1DctIvjdz0Qktjvy{mhKpcVcDl_jQVM|MG& zS(~;vd2OdI(aZu0*%HkN;b-y&u+d=76wPYq(2PBYW=)zWpBtlD9cWe?quGpHxmFkw z%^Li5Tmk=8O5UV|ECqg(l3$~Q$i#o067p*{`CF9zI+Auv$^gGZ6(3MSOja%vllT>P z2-3@7GXjS9s0Z0x{F{`V=Vrc1bz79YPsz6^kvu%Zzm0sULxv>(azZfv9ZJ4S$%m8> zxAKoDc|Zv<)$Te;=A75JMhTZr-!c@WAT1!hkQK$l<^xNtXIS z3M7KJX`RBOP6NkY zp;1kih;HEbvxouKx*2~#=;r_n*tL&6%jN%??uWojGcMwK4c+>=g-hods=BnF$wPU% zVEPK;lNbqUjsLqVE= zdIp{BVdzeR1=U@WV&6*R{jvI{OHcc8=q4Ga6oarE^`fr(ni~iK$QLf$kG)~&hL2(T zifW2;%9kXHw2AbnNQJ@5R#ufAXyd0U1sY5i4)SeeR5(z42R$nGm41_RD-Mzl3vT}% z>ZKm7D+g=;ADPc;)M_ZE*5Q_u;!c9cOO}~(oFg% z%p>Q79BBSsn2ZL^z;en>jcDdF>d&4Rwm>*+XDEY9={#N6!&GIRrQDn8Xcep&FR& zRP!bjIMo#9DjcrD+A+yFf~#d0HgVvgy5k@>=Di;(P}(#et{07m3kJ1G{Dx?pCQBNP zr;($&+*`fcn+!TSyo)28VH3WG@;!)KEf8H8Z~zX@P}QE1fXn;HmnQf%d<+{p>q;e8 z+La&*{(YKF(z0=uh7`~MU1vnbbN^uw4@1$OfleI4Kw~|=OQW_aiD>%T1nv;NBhVJ! 
z?6ig`&wmfSyw;e-rcFGd8KlW36A3=^XCQn4ehI|CpqHR6I)(#dZAoisOEA0Y>Loxr z{Tcvv{s}sqGrA_2AR(O$1$mbWOdsPE(h&p!E|4)bm2>^zyHn96%L44`0qXF#vRsQ-<&r1-%YcAhN!BqTZAa7cS#h z(1%|}f`A}g1bI|~6u5EG_-dw1#(U`ZA9&^cENHU)!t%s8cc>F7P^KPX1IM(iws}z7 z*t+@so7diCBM6>V(VqVax;@9ud>?iHMRO2<*Hu~O1LWJEFY7-3Wq_D)==+?S>H4au zv6NhIN#6Eahi8ujf@>5&jfISid5e>t6j=8m%ji(W$rVXMY1n$p(_XlbQ?F76&Ar)k zv&$$~IhP=05Wz1aUy^|2ZETC&-cF1d);$ztAey_ByG9AQI?_CqCY4mo5`m>SN~+4c ztBos;DVgNhT4FX(~knA6X&e1*B70Ix?R8Je9>t*HK$Qub!rXc F{{Z#(eOCYg diff --git a/src/HogProf/utils/__pycache__/goautils.cpython-310.pyc b/src/HogProf/utils/__pycache__/goautils.cpython-310.pyc deleted file mode 100644 index e4b5e44f16b0ec081d99900ca9934be21f2a490f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6800 zcmaJ`+ix7#d7t~tE|<$CMNy)pIAhCAxb>+2xRP zS)Ma1iOg)$mP%`L-6bg{ zo#mW4_uKbk zBEHLPnO(y761&W<;CmT8KgC``&nxWH%ty_q*vsq{d|zVjeXafKGn`Xv>#{lEaaIJq zVb(3;JVT8qqBP8kxQl$_U7i@O(}x;w6k4Hg8_c+%9q1z?vkJ6O zt}sjO&^ppetu&a)EN1t0<~(&qW@+Lu?*7|#AJ1BpMLaAbzn}ZjbSu7~z4&=&JL(qx zqqx}eL%$PtcltaZX3S?{5sIi-QB$n%FRs!Vi*U7{zt#yw++E2#`Mv%d>iM@frPE_P z=xnu3*&6VC5caWP(8WxJkan01wnFKqQL&XX(YCpb)5%6R&w6n`P*){gT9Cz@(-lZ- zC)yKjOV0JfUQmROdvQ`kJYv$sveH4q!oGC-c@*LVA8ULGqmKKlTX`C-cEfHl6gx3n z{hjqMzw_1g_g1kbI;L2CJMV*by?{qObjtav;N3~(Y6hZq@_a|EuB_1Cxbv#oAPyd? z%*tR-Ugl9T!4$kxQV|+ z5^Eo6pfUuG8BdMxt&EJ)xT`(5N|Zg+kF=3hTFhM5KGYuS{AyvKWG!obaK(pO)pASQ zotfm9wL`P&aRkxbop~SPW#M-K;my3)HSxtqsHfjI@RmCN@}Fouoy7NvSL%1cbsu3R z;;410^N)3Dr8$c@u{(E=bSXD^h@R_&YhSvzax2NZVIpp? zoOA>OS@|H!q%8(XTu2iPEzOXzwk^H!I+65-S^O~KB9b=h;!IjeoT2b2=S);PcnOPD z+wdmqU8PMBB`u?=d-yfw_@67ESl)tenf!BTJVWM*fGaXTMr#2AAK>Z>y)d>- zW-Mz&RAyaJ*a^bF3?a7;?b0{`2^3LW<~{K~)DE4(EkT$gMINw)e(04Zk@qxl8Hfh* z-Z)KOs%zKQr7Mb%7bH;-I!IJX!nfbpNW=RP$91f1-QdgEBfmt)_>~l<8|V{-y^&sk zOMpzqw*KcQG;SN5&Mj@M5HpmZXEdotAhp9fBrs=)oW!Cc@)yxf5&5FF|Jkv)ZRLFt z6mc5m!{QXZr(@bRX)EBCH6lB)QN-tHJ*UDNUZa9XKn4(tvWY-j&YQjQ|IbCTgqwUc*ckOPu>9ySqWVZy@pXD|#JoLx1IeIe~hY zZe#K+ATx#hpW%{8;Gf_+N$5satc`5&h)F!s*G3NVR>d!e20*|i1#kckc<|j1jFC60 zl^(N~H0JCWybbv9R9~&ERpU#$tlc72Vim}hZqH!uQ)5&w>xa$}1ix?(UCcVeyiudd zmopc^i8~r+D0{3%{Dt1NBWQ7~@d5Tzr@1rlY=802nX<(>o57qKYfx{@P^b02RWIGc znekkIi+qFGNBU?MrUK3RiZ^&Lx~o@j@)nKRY350ZT~ZT zm<2GXz7*uy_)dNq4jeb>G%;Yj@6%OC8w%_7H=dEPtJD^@-~j&mPzM%1;$HuUe?`wv z-jo*0v*!_ z8N}V41cTWy9SG?>;sLhF@1RS2LAv8R=C9MZ^EA!{^kyNp;^1OqCM1#eKEW;y&wrD; z=rO;kz5{6#*Q7fJZ0TY_4!oE8j&v%%l`XR1V2vsE#+edni4fXHvA~oMPv9W9inkxle4{(#2URe4EY z)USMUNwxD`EO7?2XHYT4?4P3ml!i%OX>y>nP1;=H*RKP=oRM2P%m6BzK+H?PFo)0p z@?(`Q(AYf0m$dkU*2tqepqX8I+rTLxrG1j$y}C;XTH!xXSz(HC=$3?;wP}l2)VAvi z%^aYRtI&)Pex_&u8!hHd(X4g`&De8j)}(ogxiOm6fo8Qan$4({Yo(>otij*J74Wwx zS)+t31^yLE-l62Hl&n*-fn=S(OWCg>X}6RN@b{=9qJ)@CE)kOi6?X{I%V0ACh6mJR zg%YCqbKJ}Ws{4IPLP|Q6C?1~SUF4++8B+Yq88!7N=~J>r32`gWD9I@qQ1ToDF1^Tr za7#EDaf0kTt3iy^Qmp2Ip;~y2Uw9^13T!lHD3m;%7wFz43wiPoERq>b8y3XY`{xy; zLbX@G+K2UDS_dXr%YY?s!_Wg`y$@Ep4>d|ZrjML&sB+sleq)OFcYuP!A`<@BBay{B zpB27{(>Mts`g_nEp&0V0a`2vgf(KU56$X5XL;=?=P4g@WGawQCq+pBnZpd06T85|B zK@SuAg!D#55NEv{-LnGzZQGSjKL@>DAAhgIrdF<6od%BmL93c972PoC=P?7Ub#wlL z(9Z!Du|V{9)uz!UWY_l49S; zl0B$cauVs%vwjl!X^ttyAnL}wxa+^>he81Ig%9^*cNqE6W0=08n&OP|C5a+!qC6^6 zVX(5bWo;YU_+yVcKEg?SDYM(xY{4f93xp^J$G* z4aL+Z9CA|JY4}(!kC|hDTSAaT5nLz|!Tqo3FRE>PYr2>3P>wdlA0hb|uiDEcSC!AW zNhAwoB;TP zUOz)nA|@SBSyPM1C~L|g2a4>d)6SDFYcacCiQU9j(!0;0lQNV33G=8cUDWnp`;`Pb zP1}j5)i(K642DIN(~0CcctPY2tH@U{93U_gwH^Ll>f@2<2INN9yRaN%YJ+*DSU}F7 zR0TyIz|WS85NwW_A{Lx9KEM5sF!PUT`{Xb!+AW2n5OAGS%Nfxoa1t@T;DeYxA zT&1&ZlXC=D%Pn2vzyp2TLvGA_f2Kic(|EXEG#)M()F$y8;&6s488n_oj_PV}^=fY_ z-01Nkj&O=i_yNlIAa1o#bYZ{&IQT+und+Nb23G<|IXcc3eQwve;a8lpV^0rhH) 
zS!~+GzfChdr#9zLLHGdt5{iF8FL4ElXL&Hz7LAs%2(zoMUjn4luK{4^N9b_I=$c@H zNy9&bMBSwZ)5ki2bOb?w3uH`9<=ou=-Kpr3WdU~ef!Lhfg0b%eKMF1oG?|iV#7Ie@ z@iZh)$nWty^+O+VA#$f+mgIa-P56R9lfqF5{VFml=GCN_*EYrIG;UNmO3y^V+PX4o zCZcVm6kap2+zEnn9~1sNUM1bHYWuH})`i3=;soe>N~otXG6=PDw`$5vzYh0`bIj3p zl;I8yEoTOMh-d_?Lm`+Sp$*~GC`0^INRxsg-eietD}RF?RlGxK{YfLO|5Mr=QK$~Z z45|G)WY63WX|So(E}#IQgPbyiuPEqspaYT3^<({}a=36Azkxpd6(qO~xCrW~1SxRi z;_=l?odacr{RjH0`&lql`Gw_)aqdtjQlLyf#0HLNSz~>_wzhHen>Vk0fsG(|PDOkE z_vrQ1mJaDmH8hb-~P0$2l!_J)O{2LoSNzS!XnQTQJG4M>b6%pJbxq* zT%!Ogn&?vGxk*n7sRxi{bf_ZanxdgHY~|9l7w+T4tE|H1vu9?nqFi$+LIyGXGV&z} zNJy2E+uKPH!+IYD6^K@0O66)QS4Wwr%A`_?Ss}0zM@3b2x7E2%Bz;N=>D|J)hDmgr p2rkH4Z|Dt3IsGU=I&seGX8oMEPa=XqCf&fW8aqx})ir8MwH?5gjksulZ48;(nblI} zlGKnZX$>x5KrWgdd?#iV^v54G*PeVfdg-P8-f&l1*^%M4D7q4d!#BU)Jihn6 zZQv$~I7NgR{tJmUq@<`q^xn`ffEXeYcyozE_%# zzPnA2o7{S2G^>1uSGe?-{6{9=L@i z&*YWD-Eejp?$$25Z^|=;JE$-tt{0~Kl-de&%b?oO-7}c6`K`^ja*tyz%rQ6c_UD@& zxZEu4a@E2+VHAu;&l@L=i^k^C=1&V7JF4q&o5?dnj^Tb~hevqb7D7epiGK|-0pq{Cf#91zTtvrlrf^sf&>xOYB z$T}j^y%y6z23@}!WVw*J6k^eD_oB#eB~pr3j{b3LEgY}hpBw{ig3n?ub{2gewsOpo zrdk2WNwroI^}6w)WiX^2Mp$XlkHRb`z1Cz9x1bG;;35)vsr^VV41j9)7yYn}jfI(y z|2WD0m0K%6C#9(IjJDPC6>8hxG!)+sbC`?Xxq)(Hu}%9>6})|{#A3z5r-qz^KL5D4u${>J z3s)~MEnK~Fb7A?)(n5=iEDO8m7Px2!y(nMc-6UK{WwL?&W((Jo_1iLOuS&Gi{ueUY z8VeVCc^G92m%P>B_&2m)vKSRD0K@%{^>VHn{kWVkG#SdS`@^) zY~X2W_wcO4t>jrnfW<{;=it4(xYhsS6+$pAXK;Qe)9MlKX2;!+?dLHw0 z@4hXcLMOMB09s}ypNjfe`;*3CwlIrXKw0(I0HJwf=-p2~J zIMy}+LbJi|Ujm`vCD`zTnEPvC9LPQbK9}K+FOzK@F9>@0g%AIN2aDPvs7NWy(tZa} zbEp$R-a)AU+i-H=hp!vPhu~PtL5Hbchy6~ z$E&PTbkPiB!5Eqehywrmt!t$XCB=p`QM3qo#y=8J3VajS^Y>A#BcfJ>~p(cG7_N(nxC{Pm}oD<2Jpr{0<%$N#vtA za>nN$qE+wwc)4?KAU2QOJvK--#Ol>}O|H`W&0ej-PtaZO`a|St<)u<&B5nz7KL9`W zpl4k9g>0awBz3MfOfAzzqb}(!$qE&ObkakC%_C(h>p_^Rs)&1Co%EF(WPy}HpYj9} zymCj$8mqy!%56tMo{P9~T$y>QUzM7g0Apprqm_p@nAQd}J4>RbRnjX)py}EQD{X(@DRrlS6 zjFg>4F*iMj3^AwwPO)S7n=#*JzOK;|{h1HVzk4%pm>!e#8d~Ih;~z0TvMB{e_E!`d z+!;4@A}~gYz#OHA!%|C2?HBsmIq^mCy&?prWOC8WM{`tCl)iiZp>9ZeAHW zxl8E^fa(?A(BqXo#tiNtom4I&wczgJj<&)92z!9rYHy}^?THX7?k|}t7f`OjUS*;?6P9ZirrA`cB87%M1Dka zD-*e@gMt$I1no~^RpVj14OaqSX%te`RtJ2aAVy=6vXQDI%yOP)*Ub4}e_=9<|0R_fAEE?*4K72mi^TW|Ua$eMwTYLH=L}3qcGSQzSoDG-bBLsIIoA?+}fdjjNId2;+*u!ffgy(nCwp84s^l$ z?u6jUqeqGxoiI8gJ~(-gzCq}j$~nd-?h%>!Xi={;CJg1iuzWx1NzHo6Q&u#2dPW(a zTqFPFV@-n_ppBUj9Itm@(S}#zpM78SvyyBP!B8%!S|>zJzH$XhhXCd(*yv_s{^SV{ D6XsK2 diff --git a/src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc b/src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc deleted file mode 100644 index 56ce8c6d7e412af9b166cb2e4d992c9e625c8c46..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5135 zcmc&&TW{OQ73Po>Ny(R_yJ@oCA}JfR1r(&wZgvwChMR5@Z#G2}Z_w=}Rl6`~&B&rn zk@RpB+bIPUaN6c&fffZ8?VF4~1t|K1=Cx1x3w`Q$hLSBgO^~2nR5oYG;atv~Ip;gy z;m;Np8Ww)%|8lz1IAdA=q=)IBgNKiB6^} z{Vnf+p>__~wk;o){;dLrBv@n%mIVb%>r)aivP z+RM_cA9un;JQ7K$vT#F$QJyEU;2|H#INhXyW~NS-7D^5}Do$yFYA$r^$7wGrdZIAB zmh(tP{jeVuN=PMzSPr{`Bndm2l%k{1KkaP9la=?U$3U9sM=2M3%OQ_D3UlPS5diHZ zS|>{e{d8s-4C%%RR$2~|xKJe5hK$kW; zuas>DKPP*CjtSm*liRI3H*dG@ z+`8Xdy>+|Q;i4$w{?!&2-Dr@g7Vl?qE0@_8_FJ^>W}6RW)?Jr)OZ#sXveO!{as-49rT&;)l+EjaFH&H z%TQ+JtK3%hQ;WQH?5oCuPqfxb>yJz}{$O<(cewi`D6Nr0o_oNCo@uR}X!T92ccL{= zuBvgL5N6XUgVHPgP3OSIEOq6Tb>qkRukB%DI0xUYmvc*4fj72Y`6Z;2{Pa`V znC#%3HC!m|l5lJ8cfg({YZM$>rCU9h3tf8!SXtJlpLf1YfgtEJzhafnsID5NJDNMh zx>#qitd)yh8(6Ur{rLr~h+gpSFiLs25vP&dhhHig?}akkHD(Rp#4(020~nfU%y@K8 z-jMJ2fCw|42$_yO@{PUz*lb?c+h$N>x4*d@u0{&JE)-z!B$S0?1I^^LhljmvbG_R8 z>#jO8e6q?qIRniwX_z1=;YAeQ{q&Pchm!1B+9)~%5R)G@P^5YrSMgmGn}|H0Ul_8I 
zZCTPq$(~w>CGJzihAp;a{mI@zOfXLU%+_v_0YxYXG1AI!kbvi@Ab2uT z$_u!=fon#*m#=>bU5&ObR@&kg{O_e2gHlNEP*XkKkn3a<#j8d6;j}0XXi5jU6op7T z!ss2ij{TTf*P6DeP0^f}bfU6G1tFOvb!(TRpmrX|xvqf%UiFKDOoci5a%PHq}_Uy7pi#1-2cMP~Rq@ z{5XYJX!<%ekrB271QxW4y>@^Z%$;7C%xbK?Ix=&3#=7c92Gqm^73?pZwa>a9J8S;l zWvB7C$lhfkJL3fQTTEU<&kh;??%%Z9m@Gip-+DERg$$&O7?P% zycy)lZ_%xF7t$T zlT%v5pn@2qHD{2;p>m!rIt>s^{Mqs&yqlqubmW99w{aC8q5vfXnt;5ygasf@3{=cH zaM9wy_w6mG^hS-VhY(PX-2qE^$@p2{eUnrNf#oNMnhA!AWA1MQ%1Ws~p0DCGMnv;*xSv4YN! ztm?*b0F`7oRoFYCD>}doS83gCyeuftTXXXavF*pWd;VJ#GlobV=rV@*bKI0LvF$*< zqrmJ&jF>z}CNM;vGjhl%mtd+eLN?A2+|uFBq4`b$=8rtae6jGhe2(uF-21}D_lg=! z)sO6nss2Pn++@E6@;9Ws#uy(($pHB@WdLKOev!LQn-Fwj&NGbpbaRh)uc7@~z8_{A zlra+DrLPjSNVUdvcd~0B2u`*agQJfh$wN$t>WTC?IUj$fFf&ziOp@|3IeLHDtTZ7d z)xNO&em0PXag$A~88Y^qI0hMny8fY|;7yR`LI{rIJrcATR2b)Oi~XV^Z-lhk2i@yM zIr4LS|3J_M;vJ6)%4_D8sw5@v_IaV4$>m(=Lsm*1H5eqlP#RI zrUmHPSkpR<{|)*Ultf$g0MzLsGUi1lDTX!eHF=NfMfp=yx*pS)nJh~R?c(dm2iomq z$jV*dO;6XrGxPlloV^@J9u@Lqj51lb@d^W&#$wIvQ&LSC`bt0JgGAhLFtng{1GW|9 NvcL(>2j9VU`rnJ)Mj`+J diff --git a/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc b/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc deleted file mode 100644 index 16199a6abbee2598a17fdc3ebaf1b2a7a596ca19..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2496 zcmZ`)OK%)S5bmD$uGfwqA&vuw0m5UX$cus^Arm1ICn1r@24RcGDrhvG?zLwwP?L!cWu4H={n3{=1bCbnZ<1AJ?1bM`UYEI z9&5a$-6mUPGpq%p88*x2K-OZH*gW*JjBb!%=`T2sbYcU$Nx{2dU58pOLQ~NrQjueN zM32eXsEiHrnsx$1wYIqo3n6!kQI;x)@wi|-FqPShGMH|K#a3T=CmUW9;eoP$qV|3v zcH0l`thXQBecZlxcfB1mUY1FAv(0!f8m6+%vLa~@MA7H5EZYx@?e9d<3k67HS%`Ki z;*-VpP$p^F9vtjMnI5hV4%Gbg#lj+s9E4&)Ge<2dkczh>{urbuh!HI#Ln>w~1JQ2~ zX-JdNW%7dlMgyaBc-3F?^FsQ)Vwf{O$$c4#UXpS@u+ zB3Dkvvn?)!3+Dx()=S;z%Eum=&8Xw4?|;ATlyQGkHG87SN-2_j8&Pa4Cz3GR3M>t+FjwUbx zwp&s0;h7=8D5nxh`W83^1RQf-nPUhHFp%337%1X0HcQYbcbF#_4$6esfLQT(4pX!a z@D35f0$aR;yRTrgjLlX2&yz)M#S_E1Su|itEX4}$xrPlAKjj@u<$faLoe<3$=J8e% zCJc4y*ln6>6UJ zvg%%v%6UP@j--79jD%T;g3NqnkKM}co0WTH9TR4)5N5xkW3Tf1_K#pDsz2#4N4iJe zF?nH7@-yVL4PC*Ao0DMZ&dP%U4|j^~5M!siFf4hP$5TAM z?7FxOC;%1U+fp?!qWM-7?}E!0B8z0mM{&?lrYQE6g}arl|5skQ9}R$uQhWdx(*a(` zf(yAy=SD*gE|Pf_FM{&PxwZR)?(3kVL|Q%I(xCI;{g>hMs6*$VT7QdG5T1(xOyXE$ zsO1f4eg$Itqz|+~@XzYxMls-bEg%<2Tp>RIMbCgx@M-h#P1A?+O|r*xKk|2O>AV&f zxr~w=JlTKp_>Nz4vB$*?UvN1TdFe;Fp8!u0q-nnZi7$7!5AQ*eLh{i1KE-rd4h8p< zo?k-tPBCo)!+{pB4I&X`lhf9GNZxqtw5WsxC_~V>hKhYzfEHNsi@e@dU!@l2L(H*y zUAsr;wW$M1c~9`&0{>C=qnz;(JX=nUu-iC2Km=g4iww;jBs^t!zQ*(!8n*?F@qEr} zL99VO`d54o3+K4N2Nvr()XAe-Gf@!;6Rv>Cnrcli z>cpv>K@#t#TzrE1@IlmXpneH-(Cc{6MAS!~WlcTO536@OE7&mQUt#JfF%^4c5n>8H P!?QfM={8$lt2OfLMBttcq6aopHR*{9OlBe!93Q0ug@+q zpEY06u*ufh1=faEi>o@T1c@-ppCL^$+`Uz2% zvKhTXUeaHva)1G=|Ke)U3yM+&!*W`%AT0tFi(#7apyTQb{74BnE%vn|c+7P3-ciCQ zDlH4`w=r>`4w!jqIs z?G$m&b+f;F=iy#_^pJHNZQa|`t)VD$sYF`rBaVIT#R__Zj$UgIjIs?Cp^gFHqMKA)gZ>$}o&o<9X3qitd$Yz&YVT?e){tu-nF7~4xsKHxMw7-DGB#7Q3-6?tmSow-}N;_C|HOh zDpI*G>D*OxYymN-Iuu^}wLAAJZ){iIvGay7XNxfRHJvxA#@KxV8U^Hd{k=tyhbw9} z;nAqvk1!{~OH;|Cv`}0Wac0gRt{bWN@zFO?ETftUfkEA&+J^r+o≶lk}neAkM>$ z1X4@!ND0m(5Da7b3IzRPZ1j1`;$jLtz77 zbdB$*mgXvSG7FCw`s}zLt6KI|B0bYDq zcA+dTyV?g8fp7-SfotoXDy2N^e)l)06RI-9&T r@oyYpwpu!w(KK_}`M2|uO*8%lroC~859;MxzT-E2zqQ(G`z!weUvxl> diff --git a/src/HogProf/utils/files_utils.py b/src/HogProf/utils/files_utils.py index 4b1ca40..9f1464d 100755 --- a/src/HogProf/utils/files_utils.py +++ b/src/HogProf/utils/files_utils.py @@ -1,16 +1,15 @@ -import ete3 -import pandas as pd -from Bio import Entrez import copy import pickle -import os +import ete3 +from Bio import Entrez -def get_tree(taxa , genomes , outdir = 
None): + +def get_tree(taxa, genomes, outdir=None): """ Generates a taxonomic tree using the ncbi taxonomy and :param oma: a pyoma db object - :param saveTree: Bool for whether or not to save a mastertree newick file + :param saveTree: Bool for whether to save a mastertree newick file :return: tree_string: a newick string tree: an ete3 object """ @@ -19,9 +18,8 @@ def get_tree(taxa , genomes , outdir = None): genomes = set(genomes) tax.remove(0) print(len(tax)) - tree = ete3.PhyloTree( name = '-1') - topo = ncbi.get_topology(genomes , collapse_subspecies=False) - tax = set([ str(taxid) for taxid in tax]) + tree = ete3.PhyloTree(name='-1') + topo = ncbi.get_topology(genomes, collapse_subspecies=False) tree.add_child(topo) orphans = list(genomes - set([x.name for x in tree.get_leaves()])) print('missing taxa:') @@ -29,30 +27,32 @@ def get_tree(taxa , genomes , outdir = None): orphans_info1 = {} orphans_info2 = {} + for x in orphans: + Entrez.email = 'leo.burgy@epfl.ch' search_handle = Entrez.efetch('taxonomy', id=str(x), retmode='xml') record = next(Entrez.parse(search_handle)) print(record) - orphans_info1[ record['ParentTaxId']] = x + orphans_info1[record['ParentTaxId']] = x orphans_info2[x] = [x['TaxId'] for x in record['LineageEx']] + for n in tree.traverse(): if n.name in orphans_info1: - n.add_sister(name = orphans_info1[n.name]) + n.add_sister(name=orphans_info1[n.name]) print(n) - orphans = set(genomes) - set([x.name for x in tree.get_leaves()]) + tree = add_orphans(orphans_info2, tree, genomes) - orphans = set(genomes) - set([x.name for x in tree.get_leaves()]) tree_string = tree.write(format=1) - - - with open( outdir +'master_tree.nwk' , 'w') as nwkout: + + with open(outdir + 'master_tree.nwk', 'w') as nwkout: nwkout.write(tree_string) - with open( outdir + '_master_tree.pkl' , 'wb') as pklout: + with open(outdir + '_master_tree.pkl', 'wb') as pklout: pklout.write(pickle.dumps(tree)) - + return tree_string, tree -def generate_taxa_index(tree , taxfilter= None, taxmask=None): + +def generate_taxa_index(tree, taxfilter=None, taxmask=None): """ Generates an index for the global taxonomic tree for all OMA :param tree: ete3 tree @@ -67,13 +67,13 @@ def generate_taxa_index(tree , taxfilter= None, taxmask=None): break if taxfilter: if n.name in taxfilter: - #set weight for descendants of n to 0 + # set weight for descendants of n to 0 n.delete() taxa_index = {} taxa_index_reverse = {} for i, n in enumerate(tree.traverse()): taxa_index_reverse[i] = n.name - taxa_index[n.name] = i-1 + taxa_index[n.name] = i - 1 return taxa_index, taxa_index_reverse @@ -89,7 +89,6 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): """ first = True - newdict = {} leaves = set([leaf.name for leaf in tree.get_leaves()]) @@ -101,7 +100,7 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): i = 0 print(i) - while first or ( len(orphans) > 0 and keys != oldkeys ) : + while first or (len(orphans) > 0 and keys != oldkeys): first = False oldkeys = keys leaves = set([leaf.name for leaf in tree.get_leaves()]) @@ -126,18 +125,17 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): newdict = {} nodes = {} print(orphans) - #clean up duplicates + # clean up duplicates for n in tree.traverse(): if n.name not in nodes: - nodes[ n.name] =1 + nodes[n.name] = 1 else: - nodes[ n.name] +=1 + nodes[n.name] += 1 for n in tree.traverse(): - if nodes[ n.name] >1: - if n.is_leaf()== False: + if nodes[n.name] > 1: + if n.is_leaf() == False: n.delete() - nodes[ n.name]-= 1 - + 
nodes[n.name] -= 1 return tree diff --git a/src/HogProf/utils/goautils.py b/src/HogProf/utils/goautils.py index 5bd223f..780106c 100644 --- a/src/HogProf/utils/goautils.py +++ b/src/HogProf/utils/goautils.py @@ -1,4 +1,3 @@ - from __future__ import print_function from goatools import semantic @@ -11,93 +10,93 @@ import multiprocessing as mp from tables import * import time + + ##############enrichment############################################## -def return_enrichment_study_obj(gaf_taxfiltered, obo = None): +def return_enrichment_study_obj(gaf_taxfiltered, obo=None): ''' Generate go enrichment study object with a background dataset. ''' if obo is None: - obodag = GODag(config_utils.datadir+"/GOData/go-basic.obo") + obodag = GODag(config_utils.datadir + "/GOData/go-basic.obo") else: obodag = GODag(obo) goeaobj = GOEnrichmentStudy( - gaf_taxfiltered.keys(), # - gaf_taxfiltered, # geneid/GO associations possible with tree used for DB - obodag, # Ontologies - propagate_counts = False, - alpha = 0.15, # default significance cut-off - methods = ['fdr_bh']) # defult multipletest correction method + gaf_taxfiltered.keys(), # + gaf_taxfiltered, # geneid/GO associations possible with tree used for DB + obodag, # Ontologies + propagate_counts=False, + alpha=0.15, # default significance cut-off + methods=['fdr_bh']) # defult multipletest correction method return goeaobj -def buildGAF(gaf_file , universe= None): - +def buildGAF(gaf_file, universe=None): gaf_filtered = {} with open(gaf_file, mode='r') as gafin: for line in gafin: words = line.split() if words[0] not in gaf_filtered: - gaf_filtered[words[0]]=set([words[1]]) + gaf_filtered[words[0]] = set([words[1]]) else: gaf_filtered[words[0]].add(words[1]) if universe: - gaf_filtered = { prot:gaf_filtered[prot] for prot in universe} - + gaf_filtered = {prot: gaf_filtered[prot] for prot in universe} return gaf_filtered -def return_hogs_timeout( result, retq): +def return_hogs_timeout(result, retq): print('started') - with open_file(config_utils.config['dir']['omadir']+'OmaServer.h5' , mode="r") as h5_oma: + with open_file(config_utils.config['dir']['omadir'] + 'OmaServer.h5', mode="r") as h5_oma: db_obj = db.Database(h5_oma) - res = [ ProteinEntry(db_obj, e).omaid for e in db_obj.member_of_fam(int(result)) ] + res = [ProteinEntry(db_obj, e).omaid for e in db_obj.member_of_fam(int(result))] retq.put(res) -def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): + +def run_GOEA_onresults(results, db_obj, goeaobj, outname=None): ''' Perform enrichment analysis on returned results grabs all member protein of all hogs in result returns goe results and HOG composition ''' - hogids =[ "HOG:" + (7-len(str(fam_id))) * '0' + str(fam_id) for fam_id in results ] - HOGS={} + hogids = ["HOG:" + (7 - len(str(fam_id))) * '0' + str(fam_id) for fam_id in results] + HOGS = {} print('compiling hogs') prots = [] print('mod13') retq = mp.Queue() - for i,result in enumerate(results): - if i %10 ==0: + for i, result in enumerate(results): + if i % 10 == 0: print(i) print(result) - HOGS[result]=[] - p = mp.Process( target= return_hogs_timeout , args= (result, retq)) + HOGS[result] = [] + p = mp.Process(target=return_hogs_timeout, args=(result, retq)) p.start() t0 = time.time() timeout = False - while time.time()-t0 < 10 : + while time.time() - t0 < 10: time.sleep(.1) if p.is_alive() == False: print('done') break - if time.time()-t0 > 10: + if time.time() - t0 > 10: timeout = True print('Dead') p.terminate() del p - if retq.empty() == False: iterobj = retq.get(10) - 
#retq get - for k,member in enumerate(iterobj): + # retq get + for k, member in enumerate(iterobj): if k < 1: print(member) if k > 500: @@ -107,47 +106,46 @@ def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): print('done') print('running GO enrichment study') - goea_results_all = goeaobj.run_study(prots ) + goea_results_all = goeaobj.run_study(prots) print('done') if outname: - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) + with open(config_utils.datadir + outname + 'Hogs2Prots.pkl', 'wb') as save: + save.write(pickle.dumps(HOGS, 2)) + goeaobj.wr_txt(config_utils.datadir + str(outname) + "enrichment.txt", goea_results_all) print('DONE!') return goea_results_all, HOGS -def run_GOEA_onresults_tar(results, tar, goeaobj, outname = None): +def run_GOEA_onresults_tar(results, tar, goeaobj, outname=None): ''' Perform enrichment analysis on returned results grabs all member protein of all hogs in result returns goe results and HOG composition ''' ## TODO: finish this function with tar hog to list of prot IDS - #print(db_obj.member_of_hog_id(int(results[0]))) - #hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - + # print(db_obj.member_of_hog_id(int(results[0]))) + # hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] + # print( db_obj.member_of_hog_id(hogids[0]) ) - HOGS={} + HOGS = {} print('compiling hogs') prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: + for i, result in enumerate(hogids): + if i % 10 == 0: print(i) - HOGS[result]=[] + HOGS[result] = [] for member in db_obj.iter_members_of_hog_id(result): HOGS[result].append(member.omaid) prots.append(member.omaid) print('done') print('running GO enrichment study') - goea_results_all = goeaobj.run_study(prots ) + goea_results_all = goeaobj.run_study(prots) print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) + with open(config_utils.datadir + outname + 'Hogs2Prots.pkl', 'wb') as save: + save.write(pickle.dumps(HOGS, 2)) - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) + goeaobj.wr_txt(config_utils.datadir + str(outname) + "enrichment.txt", goea_results_all) print('DONE!') return goea_results_all, HOGS @@ -175,6 +173,7 @@ def deepest_common_ancestor_hdf5(go_ids, godag, hdf5): # Take the element at maximum depth. return max(common_parent_go_ids_hdf5(go_ids, hdf5), key=lambda t: godag[t].depth) + def common_parent_go_ids_hdf5(go_ids, hdf5_set): ''' Finds the common ancestors in the GO @@ -187,12 +186,13 @@ def common_parent_go_ids_hdf5(go_ids, hdf5_set): corrected_candidates = [id2goterm(c) for c in candidates] return corrected_candidates -def resnik_sim_pandas(tup, df , termcounts): + +def resnik_sim_pandas(tup, df, termcounts): ''' Computes Resnik's similarity measure. 
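+    The score is the information content (taken from `termcounts`) of the
+    deepest common ancestor of the two GO terms, resolved through the parent
+    lists stored in the dataframe `df`; identical terms return their own IC.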
'''
     go_id1, go_id2 = tup
-    #print(df.head())
+    # print(df.head())
 
     if go_id1 == go_id2:
         return semantic.get_info_content(go_id1, termcounts)
@@ -202,9 +202,9 @@ def resnik_sim_pandas(tup, df , termcounts):
         ancestors += df.loc[str(go_id1)].parents
         terms = df.loc[ancestors]
         ancestors_set = terms.parents.tolist()
-    intersection = set(ancestors_set[0]).intersection(* ancestors_set[1:])
+    intersection = set(ancestors_set[0]).intersection(*ancestors_set[1:])
     common_ancestors = df.loc[list(intersection)]
-    common_ancestors = common_ancestors.sort_values('depth', ascending= False)
+    common_ancestors = common_ancestors.sort_values('depth', ascending=False)
     msca_goid = common_ancestors.index.tolist()[0]
     return semantic.get_info_content(msca_goid, termcounts)
 
@@ -212,18 +212,18 @@ def resnik_sim_pandas(tup, df , termcounts):
 
-def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf , genomes = None):
+def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf, genomes=None):
     '''
    iterate over hog members and get the go information from a gaf in memory
     '''
     fam = hashutils.hogid2fam(hog_id)
-    go_terms = { mr.omaid:gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf }
+    go_terms = {mr.omaid: gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf}
     return go_terms
 
 
 def goterm2id(go_term_to_modif):
     return int(go_term_to_modif.split(':')[1])
 
+
 def id2goterm(go_term_to_modif):
-    return 'GO:{:07d}'.format(go_term_to_modif)
\ No newline at end of file
+    return 'GO:{:07d}'.format(go_term_to_modif)
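+
+
+# Minimal usage sketch for the enrichment helpers above (illustrative only:
+# the GAF/OBO paths are placeholders, and `candidate_fams`/`db_obj` are assumed
+# to come from a HogProf LSH query and an open OMA database, respectively):
+#
+#     gaf = buildGAF('oma_taxfiltered.gaf')
+#     goeaobj = return_enrichment_study_obj(gaf, obo='go-basic.obo')
+#     goea_results, hog2prots = run_GOEA_onresults(candidate_fams, db_obj,
+#                                                  goeaobj, outname='query1')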