Skip to content

Commit

Permalink
enable gzip output mode
Browse files (browse the repository at this point in the history)
gzipped input --> gzipped output
uncompressed input --> uncompressed output
uncompressed input + --gzip (-z) option --> gzipped output

issues:
#26
#14
  • Loading branch information
sfchen committed Oct 28, 2017
1 parent a00ce1e commit ee05832
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 22 deletions.
4 changes: 4 additions & 0 deletions after.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ def parseCommand():
help = "set the qual num to 0 for mismatched base pairs in overlapped areas to mask them out")
parser.add_option("", "--no_overlap", dest = "no_overlap", action='store_true', default = False,
help = "disable overlap analysis (usually much faster with this option)")
parser.add_option("-z", "--gzip", dest = "gzip", action='store_true', default = False,
help = "force gzip compression for output, even the input is not gzip compressed")
parser.add_option("", "--compression", dest = "compression", type = "int", default = 2,
help = "set compression level (0~9) for gzip output, default is 2 (0 = best speed, 9 = best compression).")
return parser.parse_args()

def matchFlag(filename, flag):
Expand Down
11 changes: 9 additions & 2 deletions fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,12 @@ class Writer:

__file = None

def __init__(self, fname):
def __init__(self, fname, force_gzip = False, gzip_compression = 2):
self.filename = fname
if not self.filename.endswith(".gz") and force_gzip:
self.filename = self.filename + ".gz"
if self.filename.endswith(".gz"):
self.__file = gzip.open(self.filename, "w")
self.__file = gzip.open(self.filename, "w", compresslevel = gzip_compression)
elif self.filename.endswith(".bz2"):
print("ERROR: Write bzip2 stream is not supported")
sys.exit(1)
Expand All @@ -81,6 +83,11 @@ def __del__(self):
def flush(self):
if self.__file !=None:
self.__file.flush()

def close(self):
if self.__file !=None:
self.__file.flush()
self.__file.close()

def writeLines(self, lines):
if self.__file == None:
Expand Down
45 changes: 25 additions & 20 deletions preprocesser.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,17 +314,22 @@ def run(self):

if self.options.store_overlap and self.options.read2_file != None and (not os.path.exists(overlap_dir)):
os.makedirs(overlap_dir)

gzip_out = self.options.gzip
gzip_comp = self.options.compression;
if not gzip_out and self.options.read1_file.endswith(".gz"):
gzip_out = True

good_read1_file = None
bad_read1_file = None
overlap_read1_file = None
if not self.options.qc_only:
good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"))
bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"))
good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"), gzip_out, gzip_comp)
bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"), gzip_out, gzip_comp)

overlap_read1_file = None
if self.options.store_overlap:
overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"))
overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"), gzip_out, gzip_comp)

#other files are optional
read2_file = None
Expand All @@ -346,24 +351,24 @@ def run(self):
if self.options.read2_file != None:
read2_file = fastq.Reader(self.options.read2_file)
if not self.options.qc_only:
good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"))
bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"))
good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"), gzip_out, gzip_comp)
bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"), gzip_out, gzip_comp)
if self.options.store_overlap and self.options.read2_file != None:
overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"))
overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"), gzip_out, gzip_comp)
if self.options.index1_file != None:
index1_file = fastq.Reader(self.options.index1_file)
if not self.options.qc_only:
good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"))
bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"))
good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"), gzip_out, gzip_comp)
bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"), gzip_out, gzip_comp)
if self.options.store_overlap and self.options.read2_file != None:
overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"))
overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"), gzip_out, gzip_comp)
if self.options.index2_file != None:
index2_file = fastq.Reader(self.options.index2_file)
if not self.options.qc_only:
good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"))
bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"))
good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"), gzip_out, gzip_comp)
bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq"), gzip_out, gzip_comp)
if self.options.store_overlap and self.options.read2_file != None:
overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"))
overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"), gzip_out, gzip_comp)

r1 = None
r2 = None
Expand Down Expand Up @@ -633,17 +638,17 @@ def run(self):

#close all files
if not self.options.qc_only:
good_read1_file.flush()
bad_read1_file.flush()
good_read1_file.close()
bad_read1_file.close()
if self.options.read2_file != None:
good_read2_file.flush()
bad_read2_file.flush()
good_read2_file.close()
bad_read2_file.close()
if self.options.index1_file != None:
good_index1_file.flush()
bad_index1_file.flush()
good_index1_file.close()
bad_index1_file.close()
if self.options.index2_file != None:
good_index2_file.flush()
bad_index2_file.flush()
good_index2_file.close()
bad_index2_file.close()

# print stat numbers
BAD_READS = TOTAL_READS - GOOD_READS
Expand Down

0 comments on commit ee05832

Please sign in to comment.