Skip to content

Commit

Permalink
support gzip-compressed FASTA
Browse files Browse the repository at this point in the history
  • Loading branch information
sfchen committed May 19, 2020
1 parent 9e8a8d4 commit 8e5365a
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef COMMON_H
#define COMMON_H

#define UNIQUEKMER_VER "0.0.1"
#define UNIQUEKMER_VER "0.1.0"

#define _DEBUG false

Expand Down
105 changes: 80 additions & 25 deletions src/fastareader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "fastareader.h"
#include "util.h"
#include <sstream>
#include <string.h>

FastaReader::FastaReader(string faFile, bool forceUpperCase)
{
Expand All @@ -11,41 +12,92 @@ FastaReader::FastaReader(string faFile, bool forceUpperCase)
setlocale(LC_ALL,"C");
ios_base::sync_with_stdio(false);

mFastaFile = faFile;
mFilename = faFile;
mForceUpperCase = forceUpperCase;
if (is_directory(mFastaFile)) {
string error_msg = "There is a problem with the provided fasta file: \'";
error_msg.append(mFastaFile);
error_msg.append("\' is a directory NOT a file...\n");
throw invalid_argument(error_msg);

if (ends_with(mFilename, ".fasta.gz") || ends_with(mFilename, ".fa.gz") || ends_with(mFilename, ".fna.gz")){
mZipFile = gzopen(mFilename.c_str(), "r");
mZipped = true;
}
mFastaFileStream.open( mFastaFile.c_str(),ios::in);
// verify that the file can be read
if (!mFastaFileStream.is_open()) {
string msg = "There is a problem with the provided fasta file: could NOT read ";
msg.append(mFastaFile.c_str());
msg.append("...\n");
throw invalid_argument(msg);
else if(ends_with(mFilename, ".fasta") || ends_with(mFilename, ".fa") || ends_with(mFilename, ".fna")){
mFile.open(mFilename.c_str(), ifstream::in);
mZipped = false;
} else {
error_exit("FASTA file should have a name (*.fasta, *.fa or *.fna) or (*.fasta.gz, *.fa.gz or *.fna.gz). Not a FASTA file: " + mFilename);
}

char c;
// seek to first contig
while (mFastaFileStream.get(c) && c != '>') {
if (mFastaFileStream.eof()) {
while (getChar(c) && c != '>') {
if (eof()) {
break;
}
}
}

FastaReader::~FastaReader()
{
if (mFastaFileStream.is_open()) {
mFastaFileStream.close();
if (mZipped){
if (mZipFile){
gzclose(mZipFile);
mZipFile = NULL;
}
}
else {
if (mFile.is_open()){
mFile.close();
}
}
}

/**
 * Reads one line of text into `line` and strips its line terminator.
 *
 * The line is read with gzgets() when mZipped is set and with
 * mFile.getline() otherwise. A trailing "\n", "\r" or "\r\n" is removed
 * in place, including when the terminator is the only character read.
 *
 * @param line     caller-owned buffer that receives the NUL-terminated text
 * @param maxLine  capacity of `line` in bytes, terminator included
 * @return true when the underlying read succeeded, false otherwise; after a
 *         failed gzip read `line` holds an empty string
 */
bool FastaReader::getLine(char* line, int maxLine){
    // Without a buffer that can hold at least the terminator there is
    // nothing safe to read into or to measure with strlen().
    if(line == NULL || maxLine <= 0)
        return false;

    bool status = true;
    if(mZipped) {
        if(gzgets(mZipFile, line, maxLine) == NULL) {
            // gzgets() leaves the buffer untouched when nothing was read, so
            // terminate it here instead of scanning stale caller memory.
            line[0] = '\0';
            status = false;
        }
    }
    else {
        mFile.getline(line, maxLine, '\n');
        status = !mFile.fail();
    }

    // trim \n, \r or \r\n in the tail
    size_t len = strlen(line);
    if(len > 0 && (line[len-1] == '\n' || line[len-1] == '\r')){
        line[--len] = '\0';
        if(len > 0 && line[len-1] == '\r')
            line[--len] = '\0';
    }

    return status;
}

/**
 * Reports whether the active input source has no more data to offer.
 *
 * For gzip input this is gzeof() on mZipFile; for plain input it is
 * mFile.eof(). A source that is not open is reported as exhausted, so
 * loops of the form `while(!eof())` terminate instead of spinning on a
 * file that could not be opened.
 *
 * @return true when the end of input was reached or the source is not open
 */
bool FastaReader::eof() {
    if (mZipped) {
        // A NULL handle can never produce data; treat it as end of input.
        if (mZipFile == NULL)
            return true;
        return gzeof(mZipFile) != 0;
    }

    // An unopened stream never sets eofbit, so check for it explicitly.
    if (!mFile.is_open())
        return true;
    return mFile.eof();
}

/**
 * Reads a single character from the active input source.
 *
 * Uses gzgetc() on mZipFile when mZipped is set and mFile.get() otherwise.
 *
 * @param c  receives the character that was read; after a failed gzip read
 *           it holds the narrowed failure value, as before
 * @return true when a character was read, false on end of input or error
 */
bool FastaReader::getChar(char& c) {
    if (mZipped) {
        // gzgetc() may be implemented as a macro that dereferences the
        // handle, so never hand it a NULL one.
        if (mZipFile == NULL)
            return false;

        // Compare the int result with -1 BEFORE narrowing it: once cast to
        // char, a 0xFF data byte is indistinguishable from the failure value
        // where char is signed, and the failure value never equals -1 where
        // char is unsigned.
        const int ch = gzgetc(mZipFile);
        c = static_cast<char>(ch);
        return ch != -1;
    }

    mFile.get(c);
    return !mFile.fail();
}

void FastaReader::readNext()
{
const int maxLine = 1024;
char linebuf[maxLine];

mCurrentID = "";
mCurrentDescription = "";
mCurrentSequence = "";
Expand All @@ -55,8 +107,8 @@ void FastaReader::readNext()
stringstream ssSeq;
stringstream ssHeader;
while(true){
mFastaFileStream.get(c);
if(c == '>' || mFastaFileStream.eof())
getChar(c);
if(c == '>' || eof())
break;
else {
if (foundHeader){
Expand All @@ -68,10 +120,13 @@ void FastaReader::readNext()
else
ssHeader << c;
}

string line = "";
getline(mFastaFileStream,line,'\n');

string line;
if(mZipped) {
getLine(linebuf, maxLine);
line = string(linebuf);
} else {
getline(mFile,line,'\n');
}

if(foundHeader == false) {
ssHeader << line;
Expand All @@ -89,11 +144,11 @@ void FastaReader::readNext()
}

bool FastaReader::hasNext() {
return !mFastaFileStream.eof();
return !eof();
}

void FastaReader::readAll() {
while(!mFastaFileStream.eof()){
while(!eof()){
readNext();
mAllContigs[mCurrentID] = mCurrentSequence;
}
Expand Down
10 changes: 8 additions & 2 deletions src/fastareader.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <stdexcept>
#include <string>
#include <map>
#include "zlib/zlib.h"

using namespace std;

Expand Down Expand Up @@ -54,11 +55,16 @@ class FastaReader
bool readLine();
bool endOfLine(char c);
void setFastaSequenceIdDescription();
bool getLine(char* line, int maxLine);
bool getChar(char& c);
bool eof();

private:
string mFastaFile;
ifstream mFastaFileStream;
string mFilename;
bool mForceUpperCase;
gzFile mZipFile;
ifstream mFile;
bool mZipped;
};


Expand Down

0 comments on commit 8e5365a

Please sign in to comment.