Skip to content

Commit

Permalink
Resolve "Split reads with multiple hits"
Browse files Browse the repository at this point in the history
  • Loading branch information
ollenordesjo committed Jan 24, 2022
1 parent ada9255 commit 0b7172a
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 22 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.2.9]
### Added
- Flag to allow splitting multiple times for reads with multiple adapters
- Moving debug output and edited reads to the main output directory

## [v0.2.8]
### Changed
- Removed explicit dependency on pathlib, which caused https://github.com/nanoporetech/duplex-tools/issues/7
Expand Down
2 changes: 1 addition & 1 deletion duplex_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"split_on_adapter", "assess_split_on_adapter",
"pairs_from_summary", "filter_pairs"]

__version__ = '0.2.8'
__version__ = '0.2.9'


def main():
Expand Down
68 changes: 47 additions & 21 deletions duplex_tools/split_on_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys

import edlib
from more_itertools import pairwise
from natsort import natsorted
import numpy as np
from pyfastx import Fastx
Expand Down Expand Up @@ -113,7 +114,9 @@ def process_file(
fastx, targets, output_dir=None,
debug_output=False,
edit_threshold=None,
print_alignment=False, print_threshold_delta=0):
print_alignment=False,
print_threshold_delta=0,
allow_multiple_splits=False):
"""Run the workflow on a single file."""
newfastx = fastx.with_name(
fastx.name.replace('.fastq',
Expand All @@ -122,7 +125,7 @@ def process_file(
if output_dir is not None:
newfastx = Path(output_dir) / newfastx.name
if debug_output:
newfasta = fastx.with_name(
newfasta = Path(output_dir) / Path(
fastx.stem.split('.')[0] + '_middle').with_suffix('.fasta')
fasta = open(newfasta, 'w')

Expand All @@ -140,21 +143,35 @@ def process_file(

if result['editDistance'] < edit_threshold:
result = deduplicate_locations_first_key(result)
if len(result['locations']) > 1:
if not allow_multiple_splits and len(result['locations']) > 1:
outfh.write(f'@{read_id} {comments}\n{seq}\n+\n{qual}\n')
split_multiple_times.add(read_id)
continue

edited_reads.add(read_id)
[(start, end)] = result['locations']
if debug_output:
write_match_to_fasta(fasta, seq, start, end, read_id)
left_seq, right_seq = seq[:start], seq[end:]
left_qual, right_qual = qual[:start], qual[end:]
outfh.write(
f'@{read_id}_1 {comments}\n{left_seq}\n+\n{left_qual}\n')
outfh.write(
f'@{read_id}_2 {comments}\n{right_seq}\n+\n{right_qual}\n')
else:
hits = []
edited_reads.add(read_id)
for left_hit, right_hit in pairwise(
[(0, 0), *result['locations'], (len(seq),
len(seq))]):
hits.append([left_hit[1], right_hit[0]])
for idx, (start, end) in enumerate(hits, start=1):
if debug_output:
write_match_to_fasta(fasta,
seq,
start,
end,
read_id)
# This edge case can happen and results in empty
# sequence
if end <= start:
continue
subseq = seq[start:end]
subqual = qual[start:end]
h = (f'@{read_id}_{idx} {comments} {start}->{end}\n'
f'{subseq}\n'
f'+\n'
f'{subqual}\n')
outfh.write(h)
else:
outfh.write(f'@{read_id} {comments}\n{seq}\n+\n{qual}\n')
unedited_reads.add(read_id)
Expand All @@ -175,7 +192,8 @@ def split(
n_replacement=None,
pattern='*.fastq*', print_alignment=False,
print_threshold_delta=0,
threads=None):
threads=None,
allow_multiple_splits=False):
"""Split reads.
:param fastq_dir: The directory from which to search for
Expand All @@ -199,6 +217,7 @@ def split(
:param print_threshold_delta: The threshold to print the
alignment at, relative to edit_threshold
:param threads: number of worker threads.
:param allow_multiple_splits: Allow reads to be split more than once
"""
fastxs = natsorted(list(Path(fastq_dir).rglob(pattern)), key=str)
if output_dir is not None:
Expand All @@ -225,7 +244,8 @@ def split(
debug_output=debug_output,
edit_threshold=edit_threshold,
print_alignment=print_alignment,
print_threshold_delta=print_threshold_delta)
print_threshold_delta=print_threshold_delta,
allow_multiple_splits=allow_multiple_splits)

with ProcessPoolExecutor(max_workers=threads) as executor:
results = executor.map(worker, fastxs)
Expand All @@ -234,12 +254,13 @@ def split(
unedited_reads.update(unedited)
split_multiple_times.update(multi)

with open('edited.pkl', 'wb') as handle:
with open(Path(output, 'edited.pkl'), 'wb') as handle:
pickle.dump(edited_reads, handle)
with open('unedited.pkl', 'wb') as handle:
with open(Path(output, 'unedited.pkl'), 'wb') as handle:
pickle.dump(unedited_reads, handle)
with open('split_multiple_times.pkl', 'wb') as handle:
pickle.dump(split_multiple_times, handle)
if not allow_multiple_splits:
with open(Path(output, 'split_multiple_times.pkl'), 'wb') as handle:
pickle.dump(split_multiple_times, handle)
nedited_reads = len(edited_reads)
nunedited_reads = len(unedited_reads)
print(f'Split {nedited_reads} reads kept {nunedited_reads} reads')
Expand Down Expand Up @@ -302,6 +323,10 @@ def argparser():
parser.add_argument(
"--print_threshold_delta", default=0, type=int,
help="Threshold to print the alignment, relative to edit_threshold.")
parser.add_argument(
"--allow_multiple_splits", action="store_true",
help="Whether to split reads into more than 2 new reads if there"
" are multiple hits")
return parser


Expand All @@ -320,4 +345,5 @@ def main(args):
args.pattern,
args.print_alignment,
args.print_threshold_delta,
args.threads)
args.threads,
args.allow_multiple_splits)

0 comments on commit 0b7172a

Please sign in to comment.