Skip to content

Commit

Permalink
Compress final outputs in merge_samples when --unzip is not used
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Feb 14, 2024
1 parent df9b3b6 commit 6840f80
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 15 deletions.
30 changes: 26 additions & 4 deletions micall/core/merge_fastqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import os
import tempfile
import gzip

from micall.core.trim_fastqs import TrimSteps, trim

Expand Down Expand Up @@ -47,10 +48,31 @@ def merge_fastqs(args):
os.makedirs(os.path.dirname(args.fastq1_result), exist_ok=True)
os.makedirs(os.path.dirname(args.fastq2_result), exist_ok=True)

concatenate_files(trimmed_fastq1_a.name, trimmed_fastq1_b.name,
args.fastq1_result)
concatenate_files(trimmed_fastq2_a.name, trimmed_fastq2_b.name,
args.fastq2_result)
if args.unzipped:
concatenate_files(trimmed_fastq1_a.name, trimmed_fastq1_b.name,
args.fastq1_result)
concatenate_files(trimmed_fastq2_a.name, trimmed_fastq2_b.name,
args.fastq2_result)

else:
with tempfile.NamedTemporaryFile() as temp_fastq1, \
tempfile.NamedTemporaryFile() as temp_fastq2:

temp_fastq1.close()
temp_fastq2.close()

concatenate_files(trimmed_fastq1_a.name, trimmed_fastq1_b.name,
temp_fastq1.name)
concatenate_files(trimmed_fastq2_a.name, trimmed_fastq2_b.name,
temp_fastq2.name)

logger.info('Compressing final outputs.')

with open(temp_fastq1.name, 'rb') as f_in, gzip.open(args.fastq1_result, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)

with open(temp_fastq2.name, 'rb') as f_in, gzip.open(args.fastq2_result, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)

logger.info('Done.')

Expand Down
77 changes: 66 additions & 11 deletions micall/tests/test_merge_fastqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from io import StringIO
import csv
import gzip

from micall.core.merge_fastqs import main

Expand Down Expand Up @@ -40,26 +41,28 @@ def fastq_files(tmp_path):
fastq1_result = tmp_path / "fastq1_result.fastq"
fastq2_result = tmp_path / "fastq2_result.fastq"

args = [str(fastq1_a), str(fastq2_a), str(fastq1_b), str(fastq2_b), '--unzipped', str(fastq1_result), str(fastq2_result)]
return args
return (fastq1_a, fastq2_a, fastq1_b, fastq2_b, fastq1_result, fastq2_result)


def test_basic_main(fastq_files):
main(fastq_files)
(fastq1_a, fastq2_a, fastq1_b, fastq2_b, fastq1_result, fastq2_result) = fastq_files
args = [str(fastq1_a), str(fastq2_a), str(fastq1_b), str(fastq2_b), '--unzipped', str(fastq1_result), str(fastq2_result)]

main(args)

# Add some assertions
assert os.path.exists(fastq_files[-2]) # assert fastq1_result exists
assert os.path.exists(fastq_files[-1]) # assert fastq2_result exists

# New checks: Check if the merged files contain the correct sequences and qualities.

with open(fastq_files[-2], 'r') as f: # open fastq1_result
with open(fastq1_result, 'r') as f:
data = f.read()
# should contain the sequences and qualities of the first reads of both samples
assert FASTQ_DATA_A in data
assert FASTQ_DATA_B in data

with open(fastq_files[-1], 'r') as f: # open fastq2_result
with open(fastq2_result, 'r') as f:
data = f.read()
# should contain the sequences and qualities of the second reads of both samples
assert FASTQ_DATA_A2 in data
Expand Down Expand Up @@ -92,20 +95,72 @@ def fastq_files_with_bad_cycles(bad_cycles_csv, tmp_path):
fastq1_result = tmp_path / "fastq1_result.fastq"
fastq2_result = tmp_path / "fastq2_result.fastq"

return (fastq1_a, fastq2_a, fastq1_b, fastq2_b,
fastq1_result, fastq2_result, bad_cycles_csv)


def test_rejected_reads(fastq_files_with_bad_cycles):
(fastq1_a, fastq2_a, fastq1_b, fastq2_b, fastq1_result, fastq2_result, bad_cycles_csv) = \
fastq_files_with_bad_cycles

args = [str(fastq1_a), str(fastq2_a), str(fastq1_b), str(fastq2_b),
'--bad_cycles_a_csv', bad_cycles_csv,
'--unzipped', str(fastq1_result), str(fastq2_result)]
return args


def test_rejected_reads(fastq_files_with_bad_cycles):
main(fastq_files_with_bad_cycles)
main(args)

with open(fastq_files_with_bad_cycles[-2], 'r') as f: # open fastq1_result
with open(fastq1_result, 'r') as f:
data = f.read()
assert FASTQ_DATA_A not in data # first nucleotide filtered out.
assert FASTQ_DATA_B in data
with open(fastq_files_with_bad_cycles[-1], 'r') as f: # open fastq2_result
with open(fastq2_result, 'r') as f:
data = f.read()
assert FASTQ_DATA_A2 in data
assert FASTQ_DATA_B2 in data


def write_to_gzip(filepath, data):
    """Write *data* (text) into a new gzip-compressed file at *filepath*."""
    with gzip.open(filepath, 'wt') as writer:
        writer.write(data)


@pytest.fixture
def gzipped_fastq_files(tmp_path):
    """Create gzipped forward/reverse FASTQ inputs for samples A and B.

    Returns a 6-tuple: the four gzipped input paths followed by the two
    (not yet created) gzipped output paths.
    """
    # Map each input filename to its uncompressed FASTQ content; insertion
    # order matches the tuple order callers unpack.
    contents_by_name = {
        "fastq1_a.fastq.gz": FASTQ_DATA_A,
        "fastq2_a.fastq.gz": FASTQ_DATA_A2,
        "fastq1_b.fastq.gz": FASTQ_DATA_B,
        "fastq2_b.fastq.gz": FASTQ_DATA_B2,
    }
    input_paths = []
    for name, contents in contents_by_name.items():
        path = tmp_path / name
        write_to_gzip(path, contents)
        input_paths.append(path)

    # Output paths: merge_fastqs is expected to create these, compressed.
    fastq1_result = tmp_path / "fastq1_result.fastq.gz"
    fastq2_result = tmp_path / "fastq2_result.fastq.gz"

    return tuple(input_paths) + (fastq1_result, fastq2_result)


def test_gzipped_main(gzipped_fastq_files):
    """Without --unzipped, merged outputs should be gzip-compressed and
    contain the reads of both input samples."""
    (fastq1_a, fastq2_a, fastq1_b, fastq2_b,
     fastq1_result, fastq2_result) = gzipped_fastq_files
    args = [str(path) for path in (fastq1_a, fastq2_a, fastq1_b, fastq2_b,
                                   fastq1_result, fastq2_result)]

    main(args)

    # Each merged file must exist and include the reads from both samples.
    expectations = (
        (fastq1_result, FASTQ_DATA_A, FASTQ_DATA_B),
        (fastq2_result, FASTQ_DATA_A2, FASTQ_DATA_B2),
    )
    for result_path, sample_a, sample_b in expectations:
        assert os.path.exists(result_path)
        with gzip.open(result_path, 'rt') as reader:
            merged = reader.read()
        assert sample_a in merged
        assert sample_b in merged

0 comments on commit 6840f80

Please sign in to comment.