Skip to content

Commit

Permalink
Multiple bug-fixes (issue 15)
Browse files Browse the repository at this point in the history
* Fixed database construction for very small samples (#kmers < #threads).
* Fixed synchronization issues in new2all mode (non-deterministic row order in the output matrix).
* Fixed a deadlock during database construction when -multisample-fasta mode is run on more than one file.
  • Loading branch information
agudys authored Apr 19, 2022
1 parent 49f6231 commit 4829a56
Show file tree
Hide file tree
Showing 17 changed files with 306 additions and 146 deletions.
10 changes: 3 additions & 7 deletions .github/workflows/c-cpp.yml → .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
name: C/C++ CI
name: GitHub Actions CI

on:
push:
branches: [ master, v2, feature/tests ]
paths-ignore:
- '**.md'
pull_request:
branches: [ master, v2, feature/tests ]
branches: [ master, v2]
paths-ignore:
- '**.md'
workflow_dispatch:
Expand All @@ -22,7 +18,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: make
run: make
run: make -j2

- name: build
run: |
Expand Down
148 changes: 148 additions & 0 deletions .github/workflows/self-hosted.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Workflow executed on self-hosted runners. It is triggered by pull requests
# targeting master or v2 (changes that touch only Markdown files are ignored)
# and can also be started manually via workflow_dispatch.
name: Self-hosted CI

on:
pull_request:
branches: [ master, v2 ]
paths-ignore:
- '**.md'
workflow_dispatch:

jobs:

########################################################################################
# Checks out the repository on the self-hosted runner labelled kmer-db.
# NOTE(review): the later jobs contain no checkout step of their own, so they
# presumably rely on this workspace persisting on the runner - confirm.
checkout:
name: checkout
runs-on: [self-hosted, kmer-db]

steps:
- uses: actions/checkout@v2


########################################################################################
# Builds the project once per compiler in the matrix (g++-9, g++-10, g++-11).
# Each binary is copied to ./kmer-db-<compiler> before `make clean`.
# fail-fast is disabled, so a failure with one compiler does not cancel the others.
make-tests:
name: make
runs-on: [self-hosted, kmer-db]
needs: checkout
strategy:
fail-fast: false
matrix:
compiler: [9, 10, 11]


steps:
- name: make (g++-${{matrix.compiler}})
run: |
make -j32 CXX=g++-${{matrix.compiler}}
cp ./kmer-db ./kmer-db-${{matrix.compiler}}
make clean
########################################################################################
# Regression tests: for every (compiler, threads) combination in the matrix, the
# per-compiler binary ./kmer-db-<compiler> is run on the data under INPUT_DIR and
# each produced file is compared byte-for-byte (cmp) with a reference file stored
# in the same directory.
# NOTE(review): threads value 0 presumably means "let kmer-db choose" - confirm.
main-test:
name: Main tests
runs-on: [self-hosted, kmer-db]
needs: make-tests
strategy:
fail-fast: false
matrix:
compiler: [9, 10, 11]
threads: [1, 2, 16, 0]
env:
INPUT_DIR: ./test/virus

steps:

- name: build
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} ${INPUT_DIR}/seqs.part1.list k18.parts.db
- name: new2all
run: |
./kmer-db-${{matrix.compiler}} new2all -t ${{matrix.threads}} k18.parts.db ${INPUT_DIR}/seqs.part2.list k18.n2a.csv
cmp k18.n2a.csv ${INPUT_DIR}/k18.n2a.csv
- name: new2all (sparse)
run: |
./kmer-db-${{matrix.compiler}} new2all -t ${{matrix.threads}} -sparse k18.parts.db ${INPUT_DIR}/seqs.part2.list k18.n2a.sparse.csv
cmp k18.n2a.sparse.csv ${INPUT_DIR}/k18.n2a.sparse.csv
- name: extend
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -extend -k 25 ${INPUT_DIR}/seqs.part2.list k18.parts.db
- name: all2all
run: |
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.parts.db k18.csv
cmp k18.csv ${INPUT_DIR}/k18.csv
- name: all2all (sparse)
run: |
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} -sparse k18.parts.db k18.sparse.csv
cmp k18.sparse.csv ${INPUT_DIR}/k18.sparse.csv
- name: distance
run: |
./kmer-db-${{matrix.compiler}} distance jaccard min max cosine mash k18.csv
cmp k18.csv.jaccard ${INPUT_DIR}/k18.csv.jaccard
cmp k18.csv.min ${INPUT_DIR}/k18.csv.min
cmp k18.csv.max ${INPUT_DIR}/k18.csv.max
cmp k18.csv.cosine ${INPUT_DIR}/k18.csv.cosine
cmp k18.csv.mash ${INPUT_DIR}/k18.csv.mash
- name: build (default k) + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} ${INPUT_DIR}/seqs.list k18.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.db k18.csv
cmp k18.csv ${INPUT_DIR}/k18.csv
- name: build (default k, multifasta) + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -multisample-fasta ${INPUT_DIR}/multi.list k18.multi.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.multi.db k18.multi.csv
cmp k18.multi.csv ${INPUT_DIR}/k18.csv
- name: build (default k, 2 x multifasta) + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -multisample-fasta ${INPUT_DIR}/multi.split.list k18.multi.split.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.multi.split.db k18.multi.split.csv
cmp k18.multi.split.csv ${INPUT_DIR}/k18.csv
- name: build (default k) + extend + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} ${INPUT_DIR}/seqs.part1.list k18.parts.db
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -extend -k 25 ${INPUT_DIR}/seqs.part2.list k18.parts.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.parts.db k18.parts.csv
cmp k18.parts.csv ${INPUT_DIR}/k18.csv
- name: build (default k, fraction 0.1) + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -f 0.1 ${INPUT_DIR}/seqs.list k18.frac.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.frac.db k18.frac.csv
cmp k18.frac.csv ${INPUT_DIR}/k18.frac.csv
# The minhash route below is checked against the same reference file
# (k18.frac.csv) as the `-f 0.1` build above.
- name: minhash (default k, fraction 0.1) + build + all2all
run: |
./kmer-db-${{matrix.compiler}} minhash 0.1 ${INPUT_DIR}/seqs.list
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -from-minhash -k 25 ${INPUT_DIR}/seqs.list k18.minhash.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k18.minhash.db k18.minhash.csv
cmp k18.minhash.csv ${INPUT_DIR}/k18.frac.csv
- name: build (k=24) + all2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -k 24 ${INPUT_DIR}/seqs.list k24.db
./kmer-db-${{matrix.compiler}} all2all -t ${{matrix.threads}} k24.db k24.csv
cmp k24.csv ${INPUT_DIR}/k24.csv
- name: build (k=25, f=0.1) + one2all
run: |
./kmer-db-${{matrix.compiler}} build -t ${{matrix.threads}} -k 25 -f 0.1 ${INPUT_DIR}/seqs.part1.list k25.db
./kmer-db-${{matrix.compiler}} one2all -t ${{matrix.threads}} k25.db ${INPUT_DIR}/data/MT159713 MT159713.csv
cmp MT159713.csv ${INPUT_DIR}/MT159713.csv
- name: new2all (against itself)
run: |
./kmer-db-${{matrix.compiler}} new2all -t ${{matrix.threads}} k18.db ${INPUT_DIR}/seqs.list k18.n2a.itself.csv
cmp k18.n2a.itself.csv ${INPUT_DIR}/k18.n2a.itself.csv


2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Kmer-db
[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/kmer-db/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/kmer-db/releases)
[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/kmer-db.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/kmer-db)
[![C/C++ CI](https://github.com/refresh-bio/kmer-db/workflows/C/C++%20CI/badge.svg)](https://github.com/refresh-bio/kmer-db/actions)
[![GitHub Actions CI](../../actions/workflows/main.yml/badge.svg)](../../actions/workflows/main.yml)
[![License](https://anaconda.org/bioconda/famsa/badges/license.svg)](https://www.gnu.org/licenses/gpl-3.0.html)


Expand Down
16 changes: 7 additions & 9 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ else
EXTRA_LIBS_DIR = ""
endif


CC = g++
LDFLAGS +=
CFLAGS += -Wall -O3 -m64 -std=c++11 $(OMP_FLAGS) -pthread
CFLAGS_AVX2 += $(CFLAGS) -mavx2 -I $(KMER_DB_LIBS_DIR) -I $(EXTRA_LIBS_DIR)
Expand All @@ -60,35 +58,35 @@ OBJS := $(KMER_DB_MAIN_DIR)/analyzer.o \
$(KMER_DB_LIBS_DIR)/mmer.o

$(KMER_DB_MAIN_DIR)/parallel_sorter.o: $(KMER_DB_MAIN_DIR)/parallel_sorter.cpp
$(CC) -O3 -mavx -m64 -std=c++11 -pthread $(OMP_FLAGS) -c $< -o $@
$(CXX) -O3 -mavx -m64 -std=c++11 -pthread $(OMP_FLAGS) -c $< -o $@

ifeq ($(HAVE_AVX2),)
## no avx2 support
AVX_OBJS := $(KMER_DB_MAIN_DIR)/row_add_avx.o
$(KMER_DB_MAIN_DIR)/row_add_avx.o: $(KMER_DB_MAIN_DIR)/row_add_avx.cpp
$(CC) $(CFLAGS) -DNO_AVX2 -c $< -o $@
$(CXX) $(CFLAGS) -DNO_AVX2 -c $< -o $@

else
# with avx2 support
AVX_OBJS := $(KMER_DB_MAIN_DIR)/row_add_avx.o \
$(KMER_DB_MAIN_DIR)/row_add_avx2.o
$(KMER_DB_MAIN_DIR)/row_add_avx.o: $(KMER_DB_MAIN_DIR)/row_add_avx.cpp
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CFLAGS) -c $< -o $@
$(KMER_DB_MAIN_DIR)/row_add_avx2.o: $(KMER_DB_MAIN_DIR)/row_add_avx2.cpp
$(CC) $(CFLAGS_AVX2) -c $< -o $@
$(CXX) $(CFLAGS_AVX2) -c $< -o $@

endif


%.o: %.cpp
$(CC) $(CFLAGS) -c $< -o $@
$(CXX) $(CFLAGS) -c $< -o $@

ifeq ($(INTERNAL_ZLIB),true)
kmer-db: $(OBJS) $(AVX_OBJS)
$(CC) $(CLINK) $(LDFLAGS) -o $(KMER_DB_ROOT_DIR)/$@ $(OBJS) $(AVX_OBJS) $(EXTRA_LIBS_DIR)/libz.a
$(CXX) $(CLINK) $(LDFLAGS) -o $(KMER_DB_ROOT_DIR)/$@ $(OBJS) $(AVX_OBJS) $(EXTRA_LIBS_DIR)/libz.a
else
kmer-db: $(OBJS) $(AVX_OBJS)
$(CC) $(CLINK) $(LDFLAGS) -o $(KMER_DB_ROOT_DIR)/$@ $(OBJS) $(AVX_OBJS) -lz
$(CXX) $(CLINK) $(LDFLAGS) -o $(KMER_DB_ROOT_DIR)/$@ $(OBJS) $(AVX_OBJS) -lz
endif

clean:
Expand Down
34 changes: 18 additions & 16 deletions src/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,18 +256,18 @@ int Console::runMinHash(const std::string& multipleKmcSamples, InputFile::Format

std::chrono::duration<double> loadingTime{ 0 }, processingTime{ 0 };

LOG_DEBUG << "Creating Loader object..." << endl;
LOG_DEBUG << "Creating Loader object..." << endl ;

auto filter = std::make_shared<MinHashFilter>(fraction, 0, kmerLength);

LoaderEx loader(filter, inputFormat, numReaderThreads, numThreads, multisampleFasta);
loader.configure(multipleKmcSamples);

LOG_DEBUG << "Starting loop..." << endl;
LOG_DEBUG << "Starting loop..." << endl ;
auto totalStart = std::chrono::high_resolution_clock::now();
for (int i = 0; !loader.isCompleted(); ++i) {
auto partialTime = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - totalStart);
LOG_VERBOSE << "Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl;
LOG_VERBOSE << "Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl ;

auto task = loader.popTask(i);

Expand Down Expand Up @@ -302,7 +302,7 @@ int Console::runBuildDatabase(
InputFile::Format inputFormat,
bool extendDb){

LOG_DEBUG << "Creating PrefixKmerDb object" << endl;
LOG_DEBUG << "Creating PrefixKmerDb object" << endl ;
AbstractKmerDb* db = new PrefixKmerDb(numThreads);
std::shared_ptr<MinHashFilter> filter;

Expand All @@ -323,17 +323,17 @@ int Console::runBuildDatabase(
std::chrono::duration<double> sortingTime{ 0 }, processingTime{ 0 };

cout << "Processing samples..." << endl;
LOG_DEBUG << "Creating Loader object..." << endl;
LOG_DEBUG << "Creating Loader object..." << endl ;

LoaderEx loader(filter, inputFormat, numReaderThreads, numThreads, multisampleFasta);
loader.configure(multipleSamples);

LOG_DEBUG << "Starting loop..." << endl;
LOG_DEBUG << "Starting loop..." << endl ;
auto totalStart = std::chrono::high_resolution_clock::now();
int sample_id = 0;
for (; !loader.isCompleted(); ++sample_id) {
auto partialTime = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - totalStart);
LOG_VERBOSE << "Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl;
LOG_VERBOSE << "Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl ;

auto task = loader.popTask(sample_id);

Expand Down Expand Up @@ -364,7 +364,7 @@ int Console::runBuildDatabase(
processingTime += std::chrono::high_resolution_clock::now() - start;

loader.releaseTask(*task);
LOG_VERBOSE << db->printProgress() << endl;
LOG_VERBOSE << db->printProgress() << endl ;
}
}

Expand Down Expand Up @@ -570,7 +570,7 @@ int Console::runNewVsAll(const std::string& dbFilename, const std::string& multi
dt = std::chrono::high_resolution_clock::now() - start;
cout << "OK (" << dt.count() << " seconds)" << endl << db.printStats() << endl;

LOG_DEBUG << "Creating Loader object..." << endl;
LOG_DEBUG << "Creating Loader object..." << endl ;
shared_ptr<MinHashFilter> filter = shared_ptr<MinHashFilter>(new MinHashFilter(db.getFraction(), db.getStartFraction(), db.getKmerLength()));

LoaderEx loader(filter, inputFormat, numReaderThreads, numThreads, multisampleFasta);
Expand All @@ -595,11 +595,11 @@ int Console::runNewVsAll(const std::string& dbFilename, const std::string& multi

for (int tid = 0; tid < numThreads; ++tid) {
workers[tid] = thread([&db, &loader, &freeBuffersQueue, &similarityQueue, &buffers, &calculator, &sample_id, tid]() {
int task_id = sample_id.fetch_add(1);
while (!loader.isCompleted()) {
int task_id = sample_id.fetch_add(1);
std::shared_ptr<SampleTask> task;

std::shared_ptr<SampleTask> task;
if ((task = loader.popTask(task_id)) && freeBuffersQueue.Pop(task->bufferId2)) {
LOG_DEBUG << "loader queue " << task_id + 1 << " -> (" << task->id + 1 << ", " << task->sampleName << ")" << endl ;
buffers[task->bufferId2].clear();

// only unique k-mers are needed
Expand All @@ -608,12 +608,14 @@ int Console::runNewVsAll(const std::string& dbFilename, const std::string& multi
calculator.one2all<false>(db, task->kmers, task->kmersCount, buffers[task->bufferId2]);
similarityQueue.Push(task_id, task);

LOG_DEBUG << "(" << task_id + 1 << ") -> similarity queue, tid:" << tid << ", buf:" << task->bufferId2 << endl;
LOG_DEBUG << "(" << task->id + 1 << ", " << task->sampleName << ") -> similarity queue, tid:" << tid << ", buf:" << task->bufferId2 << endl ;
task_id = sample_id.fetch_add(1);

}
}

similarityQueue.MarkCompleted();
LOG_DEBUG << "processing finished, tid: " << tid << endl;
LOG_DEBUG << "similarity thread completed: " << tid << endl ;
});
}

Expand Down Expand Up @@ -643,7 +645,7 @@ int Console::runNewVsAll(const std::string& dbFilename, const std::string& multi
cout << "\r" << task_id + 1 << "... " << std::flush;
}

LOG_DEBUG << "similarity queue -> (" << task_id + 1 << ", " << task->sampleName << "), buf:" << task->bufferId2 << endl;
LOG_DEBUG << "similarity queue -> (" << task_id + 1 << ", " << task->sampleName << "), buf:" << task->bufferId2 << endl ;
const auto& buf = buffers[task->bufferId2];

ptr = row;
Expand Down Expand Up @@ -837,7 +839,7 @@ int Console::runAnalyzeDatabase(const std::string & multipleKmcSamples, const st
LoaderEx loader(filter, InputFile::GENOME, numReaderThreads, numThreads, true);
int numSamples = loader.configure(multipleKmcSamples);

LOG_DEBUG << "Starting loop..." << endl;
LOG_DEBUG << "Starting loop..." << endl ;
for (int i = 0; i < numSamples; ++i) {

auto task = loader.popTask(i);
Expand Down
3 changes: 1 addition & 2 deletions src/console.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ class Console

int runMinHash(const std::string& multipleSamples, InputFile::Format inputFormat);
int runDistanceCalculation(const std::string& similarityFilename, const std::vector<string>& metricNames, bool usePhylip);
int runDistanceCalculationPhylip(const std::string& similarityFilename, const std::vector<string>& metricNames);


int runListPatterns(const std::string& dbFilename, const std::string& patternFile);
int runAnalyzeDatabase(const std::string& multipleKmcSamples, const std::string& dbFilename);

Expand Down
Loading

0 comments on commit 4829a56

Please sign in to comment.