diff --git a/src/common.h b/src/common.h index a14b3ef..823585e 100644 --- a/src/common.h +++ b/src/common.h @@ -1,7 +1,7 @@ #ifndef COMMON_H #define COMMON_H -#define UNIQUEKMER_VER "0.0.1" +#define UNIQUEKMER_VER "0.1.0" #define _DEBUG false diff --git a/src/fastareader.cpp b/src/fastareader.cpp index 36795bc..fe14662 100644 --- a/src/fastareader.cpp +++ b/src/fastareader.cpp @@ -2,6 +2,7 @@ #include "fastareader.h" #include "util.h" #include +#include FastaReader::FastaReader(string faFile, bool forceUpperCase) { @@ -11,27 +12,24 @@ FastaReader::FastaReader(string faFile, bool forceUpperCase) setlocale(LC_ALL,"C"); ios_base::sync_with_stdio(false); - mFastaFile = faFile; + mFilename = faFile; mForceUpperCase = forceUpperCase; - if (is_directory(mFastaFile)) { - string error_msg = "There is a problem with the provided fasta file: \'"; - error_msg.append(mFastaFile); - error_msg.append("\' is a directory NOT a file...\n"); - throw invalid_argument(error_msg); + + if (ends_with(mFilename, ".fasta.gz") || ends_with(mFilename, ".fa.gz") || ends_with(mFilename, ".fna.gz")){ + mZipFile = gzopen(mFilename.c_str(), "r"); + mZipped = true; } - mFastaFileStream.open( mFastaFile.c_str(),ios::in); - // verify that the file can be read - if (!mFastaFileStream.is_open()) { - string msg = "There is a problem with the provided fasta file: could NOT read "; - msg.append(mFastaFile.c_str()); - msg.append("...\n"); - throw invalid_argument(msg); + else if(ends_with(mFilename, ".fasta") || ends_with(mFilename, ".fa") || ends_with(mFilename, ".fna")){ + mFile.open(mFilename.c_str(), ifstream::in); + mZipped = false; + } else { + error_exit("FASTA file should have a name (*.fasta, *.fa or *.fna) or (*.fasta.gz, *.fa.gz or *.fna.gz). Not a FASTA file: " + mFilename); } char c; // seek to first contig - while (mFastaFileStream.get(c) && c != '>') { - if (mFastaFileStream.eof()) { + while (getChar(c) && c != '>') { + if (eof()) { break; } } @@ -39,13 +37,67 @@ FastaReader::FastaReader(string faFile, bool forceUpperCase) FastaReader::~FastaReader() { - if (mFastaFileStream.is_open()) { - mFastaFileStream.close(); + if (mZipped){ + if (mZipFile){ + gzclose(mZipFile); + mZipFile = NULL; + } + } + else { + if (mFile.is_open()){ + mFile.close(); + } } } +bool FastaReader::getLine(char* line, int maxLine){ + bool status = true; + if(mZipped) + status = gzgets(mZipFile, line, maxLine); + else { + mFile.getline(line, maxLine, '\n'); + status = !mFile.fail(); + } + + // trim \n, \r or \r\n in the tail + int readed = strlen(line); + if(readed >=2 ){ + if(line[readed-1] == '\n' || line[readed-1] == '\r'){ + line[readed-1] = '\0'; + if(line[readed-2] == '\r') + line[readed-2] = '\0'; + } + } + + return status; +} + +bool FastaReader::eof() { + if (mZipped) { + return gzeof(mZipFile); + } else { + return mFile.eof(); + } +} + +bool FastaReader::getChar(char& c) { + bool status = true; + if (mZipped) { + c = (char)gzgetc(mZipFile); + if(c == -1) + status = false; + } else { + mFile.get(c); + status = !mFile.fail(); + } + return status; +} + void FastaReader::readNext() { + const int maxLine = 1024; + char linebuf[maxLine]; + mCurrentID = ""; mCurrentDescription = ""; mCurrentSequence = ""; @@ -55,8 +107,8 @@ void FastaReader::readNext() stringstream ssSeq; stringstream ssHeader; while(true){ - mFastaFileStream.get(c); - if(c == '>' || mFastaFileStream.eof()) + getChar(c); + if(c == '>' || eof()) break; else { if (foundHeader){ @@ -68,10 +120,13 @@ void FastaReader::readNext() else ssHeader << c; } - - string line = ""; - getline(mFastaFileStream,line,'\n'); - + string line; + if(mZipped) { + getLine(linebuf, maxLine); + line = string(linebuf); + } else { + getline(mFile,line,'\n'); + } if(foundHeader == false) { ssHeader << line; @@ -89,11 +144,11 @@ void FastaReader::readNext() } bool FastaReader::hasNext() { - return !mFastaFileStream.eof(); + return !eof(); } void FastaReader::readAll() { - while(!mFastaFileStream.eof()){ + while(!eof()){ readNext(); mAllContigs[mCurrentID] = mCurrentSequence; } diff --git a/src/fastareader.h b/src/fastareader.h index 97ac496..13369a1 100644 --- a/src/fastareader.h +++ b/src/fastareader.h @@ -10,6 +10,7 @@ #include #include #include +#include "zlib/zlib.h" using namespace std; @@ -54,11 +55,16 @@ class FastaReader bool readLine(); bool endOfLine(char c); void setFastaSequenceIdDescription(); + bool getLine(char* line, int maxLine); + bool getChar(char& c); + bool eof(); private: - string mFastaFile; - ifstream mFastaFileStream; + string mFilename; bool mForceUpperCase; + gzFile mZipFile; + ifstream mFile; + bool mZipped; };