Is it possible to speedup this reader's code? #4165
-
I'm trying to write code that should run fast, hehe. Here I minimized my typical code to show how I use ADIOS2.

Writer:

#include <iostream>
#include <vector>
#include <random>
#include "adios2.h"
int main(int argc, char const *argv[])
{
    std::random_device random_device;
    std::mt19937_64 generator(random_device());
    std::uniform_real_distribution<double> distribution(0.0, 1.0);

    unsigned int nx = 9000;
    unsigned int ny = 1000;
    unsigned int nsteps = 10;
    std::vector<double> vec(nx*ny, 0);

    adios2::ADIOS adios = adios2::ADIOS();
    adios2::IO dataio = adios.DeclareIO("DataWriter");
    dataio.SetEngine("BP4");
    dataio.AddTransport("File", {{"Library", "posix"}});

    adios2::Engine writer = dataio.Open("data.bp", adios2::Mode::Write);
    writer.LockWriterDefinitions();
    adios2::Variable<double> WvarNumbers = dataio.DefineVariable<double>("numbers", {nx, ny}, {0, 0}, {nx, ny}, adios2::ConstantDims);

    for (size_t step = 0; step < nsteps; ++step){
        // Refill the buffer with fresh random values, then write it as one step
        for (size_t i = 0; i < nx*ny; ++i) vec[i] = distribution(generator);
        writer.BeginStep();
        writer.Put(WvarNumbers, vec.data(), adios2::Mode::Deferred);
        writer.PerformPuts();
        writer.EndStep();
    }
    writer.Close();
    return 0;
}

Reader:

#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <chrono>
#include <cstdint>
#include "adios2.h"
#ifdef __ENABLE_CALLGRIND__
#include <valgrind/callgrind.h>
#endif // __ENABLE_CALLGRIND__
struct Timer{
    uint64_t counter = 0;
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    std::chrono::high_resolution_clock::time_point t2 = t1;
    std::chrono::duration<long double, std::milli> sum = std::chrono::duration<long double, std::milli>::zero();
    void begin(){
        t1 = std::chrono::high_resolution_clock::now();
        t2 = t1;
    }
    void end(){
        t2 = std::chrono::high_resolution_clock::now();
        counter += 1;
        sum += std::chrono::duration<long double, std::milli>(t2 - t1);
    }
    // Average duration per measured interval
    std::chrono::duration<long double, std::milli> get(){
        if (counter > 0) return sum / counter;
        else return sum;
    }
};
std::string format_duration(std::chrono::duration<long double, std::milli> ms) {
    std::chrono::seconds secs = std::chrono::duration_cast<std::chrono::seconds>(ms);
    ms -= std::chrono::duration_cast<std::chrono::milliseconds>(secs);
    std::chrono::minutes mins = std::chrono::duration_cast<std::chrono::minutes>(secs);
    secs -= std::chrono::duration_cast<std::chrono::seconds>(mins);
    std::chrono::hours hour = std::chrono::duration_cast<std::chrono::hours>(mins);
    mins -= std::chrono::duration_cast<std::chrono::minutes>(hour);
    // hours:minutes:seconds:milliseconds
    return std::to_string(hour.count()) + ":" + std::to_string(mins.count()) + ":" + std::to_string(secs.count()) + ":" + std::to_string(ms.count());
}
int main(int argc, char const *argv[])
{
    unsigned int nx = 9000;
    unsigned int ny = 1000;
    // unsigned int nsteps = 100;
    std::unique_ptr<double[]> data_buff(new double[nx*ny]);

    adios2::ADIOS adios = adios2::ADIOS();
    adios2::IO dataio = adios.DeclareIO("DataReader");
    dataio.SetEngine("BP4");
    adios2::Engine reader = dataio.Open("data.bp", adios2::Mode::Read);

    adios2::Variable<double> WvarNumbers;
    adios2::Dims WvarNumbersShape;

#ifdef __ENABLE_TIMER__
    Timer timer;
#endif // __ENABLE_TIMER__
#ifdef __ENABLE_CALLGRIND__
    CALLGRIND_START_INSTRUMENTATION;
#endif // __ENABLE_CALLGRIND__

    while (reader.CurrentStep() < reader.Steps() - 1){
        if (reader.BeginStep() == adios2::StepStatus::EndOfStream) {
            std::cerr << "Unexpected EOS" << std::endl;
            break;
        }
        WvarNumbers = dataio.InquireVariable<double>("numbers");
        if (!WvarNumbers) {
            std::cerr << "Cannot get variable" << std::endl;
            reader.EndStep();
            continue;
        }
#ifdef __ENABLE_CALLGRIND__
        CALLGRIND_TOGGLE_COLLECT;
#endif // __ENABLE_CALLGRIND__
#ifdef __ENABLE_TIMER__
        timer.begin();
#endif // __ENABLE_TIMER__
        reader.Get(WvarNumbers, data_buff.get(), adios2::Mode::Deferred);
        reader.PerformGets();
#ifdef __ENABLE_TIMER__
        timer.end();
#endif // __ENABLE_TIMER__
#ifdef __ENABLE_CALLGRIND__
        CALLGRIND_TOGGLE_COLLECT;
#endif // __ENABLE_CALLGRIND__
        reader.EndStep();
    }
#ifdef __ENABLE_CALLGRIND__
    CALLGRIND_STOP_INSTRUMENTATION;
#endif // __ENABLE_CALLGRIND__
#ifdef __ENABLE_TIMER__
    std::cout << format_duration(timer.get()) << std::endl;
#endif // __ENABLE_TIMER__
    reader.Close();
    return 0;
}

Just in case, here is the CMakeLists.txt:

cmake_minimum_required(VERSION 3.18.0)
project(ReadWrite VERSION 0.1.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
find_package(ADIOS2 REQUIRED)
if(DEFINED USE_CALLGRIND)
    if(USE_CALLGRIND STREQUAL ON)
        message(STATUS "Callgrind: on")
        add_definitions(-D__ENABLE_CALLGRIND__)
    else()
        message(STATUS "Callgrind: off")
    endif()
else()
    message(STATUS "Callgrind: off")
endif()
if(DEFINED USE_TIMER)
    if(USE_TIMER STREQUAL ON)
        message(STATUS "Timer: on")
        add_definitions(-D__ENABLE_TIMER__)
    else()
        message(STATUS "Timer: off")
    endif()
else()
    message(STATUS "Timer: off")
endif()
set(CMAKE_CXX_FLAGS "-g -O3")
add_executable(writer ${PROJECT_SOURCE_DIR}/writer.cpp)
add_executable(reader ${PROJECT_SOURCE_DIR}/reader.cpp)
target_link_libraries(writer PRIVATE adios2::cxx11)
target_link_libraries(reader PRIVATE adios2::cxx11)
-
I've got some random notes on your code which I'll add at the end, but mostly there's not much on the reader side to customize. In simple situations like this, ADIOS read performance depends upon filesystem performance. Make sure ADIOS itself is compiled with optimization (a Release build) and think about using BP5, the current default file engine.

In more complex situations, for example a multi-dimensional global array with portions written by many different writer ranks where the reader wants only a slice of the whole, performance will vary not only with the raw filesystem performance, but also with how much processing ADIOS has to do to assemble the requested data from the set of chunks it was written in.

Notes:
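(Illustrating the "reader wants only a slice" case above: a minimal, hedged sketch of a reader-side selection against the 9000 x 1000 "numbers" array, reusing the dataio and reader handles from the question; the start/count values are invented purely for illustration.)

// Inside the step loop, after BeginStep()
adios2::Variable<double> var = dataio.InquireVariable<double>("numbers");
// Request only rows [1000, 2000) of the global array: {start}, {count}
var.SetSelection({{1000, 0}, {1000, 1000}});
std::vector<double> slice(1000 * 1000);
reader.Get(var, slice.data(), adios2::Mode::Sync);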
-
Historically, ADIOS has focused on minimizing writer overhead, because in the HPC environment it's usually the data producer, running at the largest scale, that has the biggest problems with I/O. So it has considerable complexity that helps optimize the write (aggregating data to a subset of ranks to avoid every rank writing, etc.), with the data provided in each timestep by each writer treated as a separate chunk to be managed. In general, this leaves the "chunks" of the global array scattered in storage, and the reader simply has to deal with the situation.

When dealing with many chunks on disk (as opposed to the single chunk of a single writer), the reader will be slower to fetch the baseline data because it must perform multiple non-contiguous reads. The complexity of the assembly depends more on the dimensionality of the data and how it's decomposed. If your array is decomposed across its slowest-changing-in-memory dimension, the chunks written by each writer end up contiguous in memory when assembled, and that's quicker. If the opposite, you'll have an unpleasant memory access pattern during the reassembly.

One possible solution, if your application allows, is to use the BlocksInfo call to see how the writer put the data out there, and then Get() and process each piece block by block. That's potentially more complexity for the reader, but it avoids reassembly costs.
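To sketch what that block-by-block pattern could look like in the question's reader (a rough, hedged illustration only; it reuses the reader and WvarNumbers handles from the posted code and leaves the per-block processing as a placeholder):

// Inside the BeginStep/EndStep loop, after InquireVariable succeeds
const auto blocks = reader.BlocksInfo(WvarNumbers, reader.CurrentStep());
std::vector<double> block_buff;
for (const auto &info : blocks)
{
    WvarNumbers.SetBlockSelection(info.BlockID);
    // Size the buffer for this block's local dimensions
    size_t nelems = 1;
    for (const size_t d : info.Count) nelems *= d;
    block_buff.resize(nelems);
    reader.Get(WvarNumbers, block_buff.data(), adios2::Mode::Sync);
    // ... process this block; info.Start and info.Count locate it in the global array
}

With a single writer rank, as in the posted example, there is only one block per step, so this mainly pays off when the data was produced by many writer ranks.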
Probably this is due to C++ demanding "safety". std::vector insists on initializing all elements on .resize(), even if you know you're going to overwrite all of that on the next line. You could change when that initialization happens by calling resize() instead of reserve(), but it still happens somewhere. This isn't such a big deal if you're initializing a handful of elements, but for a few million, dragging all that data through the cache multiple times hurts. However, if you reuse the same vector and read the same amount of data, it should only happen the first time.
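To illustrate that last point with the names from the question (a small sketch, not a complete program): size one std::vector before the step loop and hand the same memory to every Get(), so the zero-initialization cost is paid a single time.

// Before the step loop: one allocation and one zero-initialization of nx*ny doubles
std::vector<double> buff(nx * ny);

// Inside the step loop: each Get() simply overwrites the same memory, no re-initialization
reader.Get(WvarNumbers, buff.data(), adios2::Mode::Deferred);
reader.PerformGets();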