Initial commit.

epi2me-labs · Aug 7, 2018 · 58c5aac · 58c5aac
commit 58c5aac
Show file tree

Hide file tree

Showing 31 changed files with 1,907 additions and 0 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -0,0 +1,37 @@
+# GitLab CI configuration: one test job and one documentation job, both of
+# which are skipped for tag pipelines.
+image: ubuntu:xenial
+
+stages: [test, pages, release]
+
+# Shared setup for every job: system packages via apt, Python tooling via pip,
+# then the package itself installed in editable mode from the checkout.
+before_script:
+  - apt-get update
+  - apt-get install -y software-properties-common
+  - apt-add-repository universe
+  - apt-get update
+  - apt-get install -y python-pip make python-numpy python-matplotlib python-biopython python-pandas mummer last-align
+  - pip install --upgrade pip sphinx sphinx-argparse sphinx_rtd_theme pytest pycmd futures packaging appdirs pysam
+  - hash -r pip
+  - pip install -e ./
+
+
+# Runs the Makefile's test target.
+do_testing:
+  stage: test
+  script: [make test]
+  except: [tags]
+
+# Builds the docs on master and collects the generated HTML as the public/ artifact.
+pages:
+  stage: pages
+  script:
+    - make docs
+    - mv docs/_build/html public
+  artifacts:
+    paths: [public/]
+  only: [master]
+  except: [tags]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2017 Oxford Nanopore Technologies
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,8 @@
+
+# Top-level files added to the source distribution.
+include README.md LICENSE
+
+# Keep bytecode and cache entries out of the distribution.
+recursive-exclude * __pycache__ *.py[co]
+
+# Documentation sources, Sphinx configuration, build scripts and images.
+recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
diff --git a/Makefile b/Makefile
@@ -0,0 +1,75 @@
+# Name of the Python package; passed to flake8 and coverage further down.
+MODULE=pychopper
+
+# Every target in this file is a command rather than a file to be tracked by
+# timestamp, so all of them are declared phony. Without this, an existing file
+# or directory with the same name as a target could shadow that target.
+.PHONY: clean clean-test clean-pyc clean-build docs com help lint test coverage servedocs release dist install it
+
+# A bare `make` prints the list of documented targets.
+.DEFAULT_GOAL := help
+
+# Python helper used by the help target: it reads makefile text on stdin and
+# prints one aligned "target  description" line for each rule line that ends
+# in a double-hash description.
+define PRINT_HELP_PYSCRIPT
+import re, sys
+
+documented = re.compile(r'^([a-zA-Z_-]+):.*?## (.*)$$')
+for text in sys.stdin:
+	found = documented.match(text)
+	if found is None:
+		continue
+	print("%-20s %s" % found.groups())
+endef
+# Exported so the recipe can pass the script to python through the environment.
+export PRINT_HELP_PYSCRIPT
+
+help:
+	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
+
+# Umbrella target: it has no recipe and only pulls in the three clean-* targets.
+clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
+
+
+# Each step runs only if the previous one succeeded.
+clean-build: ## remove build artifacts
+	rm -fr build/ && \
+	rm -fr dist/ && \
+	rm -fr .eggs/
+	find . -name '*.egg-info' -exec rm -fr {} +
+	find . -name '*.egg' -exec rm -f {} +
+
+# The three file patterns are removed with plain `rm -f`; only the
+# __pycache__ entries are removed recursively.
+clean-pyc: ## remove Python file artifacts
+	for pattern in '*.pyc' '*.pyo' '*~'; do \
+		find . -name "$$pattern" -exec rm -f {} + || exit 1; \
+	done
+	find . -name '__pycache__' -exec rm -fr {} +
+
+clean-test: ## remove test and coverage artifacts
+	rm -f .coverage && rm -fr htmlcov/
+
+# Best-effort style report: E501 lines are filtered out of the flake8 output
+# and the trailing `|| true` keeps the target from ever failing.
+lint: flake8_cmd := flake8 --max-line-length=120
+lint: e501_filter := grep -v "E501 line too long"
+lint: ## check style with flake8
+	@$(flake8_cmd) $(MODULE) | $(e501_filter) || true
+	@$(flake8_cmd) scripts/*.py | $(e501_filter) || true
+
+# Runs py.test with no arguments.
+test: ## run tests quickly with the default Python
+	py.test
+
+# The same omit pattern is applied to both the coverage run and the text report.
+coverage: omit_patterns := */tests/*,*__init__.py
+coverage: ## check code coverage quickly with the default Python
+	coverage run --source $(MODULE) --omit="$(omit_patterns)" $$(which py.test)
+	coverage report -m --omit="$(omit_patterns)"
+	coverage html
+
+# Rebuild the HTML documentation from scratch. The sub-make is invoked through
+# $(MAKE) -C so that make flags propagate and a missing docs/ directory is a
+# hard error instead of running the goals in the wrong directory. The two
+# goals are separate steps so that clean always finishes before html starts.
+docs: ## generate Sphinx HTML documentation, including API docs
+	@$(MAKE) -C docs clean
+	@$(MAKE) -C docs html
+
+# Builds the docs once, then hands the html build command to watchmedo with a
+# '*.rst' pattern (presumably re-running it on matching changes; see the
+# watchmedo documentation for the -R and -D flags).
+servedocs: docs ## compile the docs watching for changes
+	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
+
+# All three targets start from a clean tree.
+# NOTE(review): `setup.py upload` is believed to be deprecated upstream in
+# favour of twine; confirm before relying on this target.
+release: clean ## package and upload a release
+	python setup.py sdist upload && \
+	python setup.py bdist_wheel upload
+
+# Builds both package formats, then lists the dist directory.
+dist: clean ## builds source and wheel package
+	python setup.py sdist && \
+	python setup.py bdist_wheel && \
+	ls -l dist
+
+install: clean ## install the package to the active Python's site-packages
+	python setup.py install
+
+com: ## commit all changes to git
+	git commit --all
+
+# Runs the classifier script on the bundled test data: barcodes.fas is passed
+# via -b, ref.fas is the input and test_output.fas the output.
+it: ## integration test
+	./scripts/cdna_classifier.py -s 95 -i fasta \
+		-b pychopper/tests/data/barcodes.fas \
+		pychopper/tests/data/ref.fas \
+		pychopper/tests/data/test_output.fas
diff --git a/README.md b/README.md
@@ -0,0 +1,76 @@
+Pychopper: A tool to identify full length cDNA reads
+====================================================
+
+Installation
+------------
+
+Install the package:
+
+```
+python setup.py install
+```
+
+Install the package in developer mode:
+
+```
+python setup.py develop
+```
+
+Run the tests:
+
+```
+make test
+```
+
+Build the documentation:
+
+```
+make docs
+```
+
+Issue `make help` to get a list of `make` targets.
+
+Usage
+-----
+
+```
+usage: cdna_classifier.py [-h] -b barcodes [-i input_format] [-g aln_params]
+                          [-t target_length] [-s score_percentile]
+                          [-n sample_size] [-r report_pdf] [-u unclass_output]
+                          input_fastx output_fastx
+
+Tool to identify full length cDNA reads. Primers have to specified as they are
+on the forward strand.
+
+positional arguments:
+  input_fastx          Input file.
+  output_fastx         Output file.
+
+optional arguments:
+  -h, --help           show this help message and exit
+  -b barcodes          Primers fasta.
+  -i input_format      Input/output format (fastq).
+  -g aln_params        Alignment parameters (match,
+                       mismatch,gap_open,gap_extend).
+  -t target_length     Number of bases to scan at each end (200).
+  -s score_percentile  Score cutoff percentile (100).
+  -n sample_size       Number of samples when calculating score cutoff
+                       (100000).
+  -r report_pdf        Report PDF.
+  -u unclass_output    Write unclassified reads to this file.
+```
+
+The primers have to be specified as they are on the forward strand (see `data/cdna_barcodes.fas` for an example).
+The score cutoffs for each primer are calculated by aligning them against random sequences and applying the following formula: `<-s percentile of the score distribution> + 2 * <standard deviation of score distribution>`. The default settings are stringent in order to avoid false positives. Stringency can be lowered by lowering the value of `-s`.
+
+Documentation
+-------------
+
+Documentation can be found at: XXX 
+
+Contributing
+------------
+
+- Please fork the repository and create a merge request to contribute.
+- Use [bumpversion](http://bit.ly/2cSUryt) to manage package versioning.
+- The code should be [PEP8](https://www.python.org/dev/peps/pep-0008) compliant, which can be tested by `make lint`.
diff --git a/data/cdna_barcodes.fas b/data/cdna_barcodes.fas
@@ -0,0 +1,4 @@
+>cDNA|1
+TTTCTGTTGGTGCTGATATTGCGGG
+>cDNA|2
+AAAAAAAAAAAAAAAAAAAAGAAGATAGAGCGACAGGCAAGT
diff --git a/docs/.gitignore b/docs/.gitignore
@@ -0,0 +1 @@
+*.rst