Is it possible to speedup this reader's code? #4165
-
I'm trying to write code that should run fast, hehe. Here I minimized my typical code to show how I use ADIOS2.

Writer:

#include <iostream>
#include <vector>
#include <random>
#include "adios2.h"
int main(int argc, char const *argv[])
{
    std::random_device random_device;
    std::mt19937_64 generator(random_device());
    std::uniform_real_distribution<double> distribution(0.0, 1.0);

    unsigned int nx = 9000;
    unsigned int ny = 1000;
    unsigned int nsteps = 10;
    std::vector<double> vec(nx*ny, 0);

    adios2::ADIOS adios = adios2::ADIOS();
    adios2::IO dataio = adios.DeclareIO("DataWriter");
    dataio.SetEngine("BP4");
    dataio.AddTransport("File", {{"Library", "posix"}});

    adios2::Engine writer = dataio.Open("data.bp", adios2::Mode::Write);
    writer.LockWriterDefinitions();
    adios2::Variable<double> WvarNumbers = dataio.DefineVariable<double>("numbers", {nx, ny}, {0, 0}, {nx, ny}, adios2::ConstantDims);

    for (size_t step = 0; step < nsteps; ++step){
        // Refill the buffer with fresh random values, then write it as one step
        for (size_t i = 0; i < nx*ny; ++i) vec[i] = distribution(generator);
        writer.BeginStep();
        writer.Put(WvarNumbers, vec.data(), adios2::Mode::Deferred);
        writer.PerformPuts();
        writer.EndStep();
    }
    writer.Close();
    return 0;
}

Reader:

#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <chrono>
#include <cstdint>
#include "adios2.h"
#ifdef __ENABLE_CALLGRIND__
#include <valgrind/callgrind.h>
#endif // __ENABLE_CALLGRIND__
struct Timer{
    uint64_t counter = 0;
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    std::chrono::high_resolution_clock::time_point t2 = t1;
    std::chrono::duration<long double, std::milli> sum = std::chrono::duration<long double, std::milli>::zero();
    void begin(){
        t1 = std::chrono::high_resolution_clock::now();
        t2 = t1;
    }
    void end(){
        t2 = std::chrono::high_resolution_clock::now();
        counter += 1;
        sum += std::chrono::duration<long double, std::milli>(t2 - t1);
    }
    // Average duration per measured interval
    std::chrono::duration<long double, std::milli> get(){
        if (counter > 0) return sum / counter;
        else return sum;
    }
};
std::string format_duration(std::chrono::duration<long double, std::milli> ms) {
    std::chrono::seconds secs = std::chrono::duration_cast<std::chrono::seconds>(ms);
    ms -= std::chrono::duration_cast<std::chrono::milliseconds>(secs);
    std::chrono::minutes mins = std::chrono::duration_cast<std::chrono::minutes>(secs);
    secs -= std::chrono::duration_cast<std::chrono::seconds>(mins);
    std::chrono::hours hour = std::chrono::duration_cast<std::chrono::hours>(mins);
    mins -= std::chrono::duration_cast<std::chrono::minutes>(hour);
    // hours:minutes:seconds:milliseconds
    return std::to_string(hour.count()) + ":" + std::to_string(mins.count()) + ":" + std::to_string(secs.count()) + ":" + std::to_string(ms.count());
}
int main(int argc, char const *argv[])
{
    unsigned int nx = 9000;
    unsigned int ny = 1000;
    // unsigned int nsteps = 100;
    std::unique_ptr<double[]> data_buff(new double[nx*ny]);

    adios2::ADIOS adios = adios2::ADIOS();
    adios2::IO dataio = adios.DeclareIO("DataReader");
    dataio.SetEngine("BP4");
    adios2::Engine reader = dataio.Open("data.bp", adios2::Mode::Read);

    adios2::Variable<double> WvarNumbers;
    adios2::Dims WvarNumbersShape;

#ifdef __ENABLE_TIMER__
    Timer timer;
#endif // __ENABLE_TIMER__
#ifdef __ENABLE_CALLGRIND__
    CALLGRIND_START_INSTRUMENTATION;
#endif // __ENABLE_CALLGRIND__

    while (reader.CurrentStep() < reader.Steps() - 1){
        if (reader.BeginStep() == adios2::StepStatus::EndOfStream) {
            std::cerr << "Unexpected EOS" << std::endl;
            break;
        }
        WvarNumbers = dataio.InquireVariable<double>("numbers");
        if (!WvarNumbers) {
            std::cerr << "Cannot get variable" << std::endl;
            reader.EndStep();
            continue;
        }
#ifdef __ENABLE_CALLGRIND__
        CALLGRIND_TOGGLE_COLLECT;
#endif // __ENABLE_CALLGRIND__
#ifdef __ENABLE_TIMER__
        timer.begin();
#endif // __ENABLE_TIMER__
        reader.Get(WvarNumbers, data_buff.get(), adios2::Mode::Deferred);
        reader.PerformGets();
#ifdef __ENABLE_TIMER__
        timer.end();
#endif // __ENABLE_TIMER__
#ifdef __ENABLE_CALLGRIND__
        CALLGRIND_TOGGLE_COLLECT;
#endif // __ENABLE_CALLGRIND__
        reader.EndStep();
    }
#ifdef __ENABLE_CALLGRIND__
    CALLGRIND_STOP_INSTRUMENTATION;
#endif // __ENABLE_CALLGRIND__
#ifdef __ENABLE_TIMER__
    std::cout << format_duration(timer.get()) << std::endl;
#endif // __ENABLE_TIMER__
    reader.Close();
    return 0;
}

Just in case, here is the CMakeLists.txt:

cmake_minimum_required(VERSION 3.18.0)
project(ReadWrite VERSION 0.1.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
find_package(ADIOS2 REQUIRED)
if(DEFINED USE_CALLGRIND)
    if(USE_CALLGRIND STREQUAL ON)
        message(STATUS "Callgrind: on")
        add_definitions(-D__ENABLE_CALLGRIND__)
    else()
        message(STATUS "Callgrind: off")
    endif()
else()
    message(STATUS "Callgrind: off")
endif()
if(DEFINED USE_TIMER)
    if(USE_TIMER STREQUAL ON)
        message(STATUS "Timer: on")
        add_definitions(-D__ENABLE_TIMER__)
    else()
        message(STATUS "Timer: off")
    endif()
else()
    message(STATUS "Timer: off")
endif()
set(CMAKE_CXX_FLAGS "-g -O3")
add_executable(writer ${PROJECT_SOURCE_DIR}/writer.cpp)
add_executable(reader ${PROJECT_SOURCE_DIR}/reader.cpp)
target_link_libraries(writer PRIVATE adios2::cxx11)
target_link_libraries(reader PRIVATE adios2::cxx11)
-
I've got some random notes on your code which I'll add at the end, but mostly there's not much on the reader side to customize. In simple situations like this, ADIOS read performance depends upon filesystem performance. Make sure ADIOS itself is compiled with optimization (a Release build) and think about using BP5, the current default file engine.

In more complex situations, for example a multi-dimensional global array with portions written by many different writer ranks where the reader wants only a slice of the whole, performance will vary not only with the raw filesystem performance, but also with how much processing ADIOS has to do to assemble the requested data from the set of chunks it was written in.

Notes:
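(Illustrating the "reader wants only a slice" case above: a minimal, hedged sketch of a reader-side selection against the 9000 x 1000 "numbers" array, reusing the dataio and reader handles from the question; the start/count values are invented purely for illustration.)

// Inside the step loop, after BeginStep()
adios2::Variable<double> var = dataio.InquireVariable<double>("numbers");
// Request only rows [1000, 2000) of the global array: {start}, {count}
var.SetSelection({{1000, 0}, {1000, 1000}});
std::vector<double> slice(1000 * 1000);
reader.Get(var, slice.data(), adios2::Mode::Sync);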
-
Historically, ADIOS has focused on minimizing writer overhead, because in the HPC environment it's usually the data producer, running at the largest scale, that has the biggest problems with I/O. So it has considerable complexity that helps optimize the write (aggregating data to a subset of ranks to avoid every rank writing, etc.), with the data provided in each timestep by each writer treated as a separate chunk to be managed. In general, this leaves the "chunks" of the global array scattered in storage, and the reader simply has to deal with the situation.

When dealing with many chunks on disk (as opposed to the single chunk of a single writer), the reader will be slower to fetch the baseline data because it must perform multiple non-contiguous reads. The complexity of the assembly depends more on the dimensionality of the data and how it's decomposed. If your array is decomposed across its slowest-changing-in-memory dimension, the chunks written by each writer end up contiguous in memory when assembled, and that's quicker. If the opposite, you'll have an unpleasant memory access pattern during the reassembly.

One possible solution, if your application allows, is to use the BlocksInfo call to see how the writer put the data out there, and then Get() and process each piece block by block. That's potentially more complexity for the reader, but it avoids reassembly costs.
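To sketch what that block-by-block pattern could look like in the question's reader (a rough, hedged illustration only; it reuses the reader and WvarNumbers handles from the posted code and leaves the per-block processing as a placeholder):

// Inside the BeginStep/EndStep loop, after InquireVariable succeeds
const auto blocks = reader.BlocksInfo(WvarNumbers, reader.CurrentStep());
std::vector<double> block_buff;
for (const auto &info : blocks)
{
    WvarNumbers.SetBlockSelection(info.BlockID);
    // Size the buffer for this block's local dimensions
    size_t nelems = 1;
    for (const size_t d : info.Count) nelems *= d;
    block_buff.resize(nelems);
    reader.Get(WvarNumbers, block_buff.data(), adios2::Mode::Sync);
    // ... process this block; info.Start and info.Count locate it in the global array
}

With a single writer rank, as in the posted example, there is only one block per step, so this mainly pays off when the data was produced by many writer ranks.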
Probably this is due to C++ demanding "safety". std::vector insists on initializing all elements on .resize(), even if you know you're going to overwrite all of that on the next line. You could change when that initialization happens by calling resize() instead of reserve(), but it still happens somewhere. This isn't such a big deal if you're initializing a handful of elements, but for a few million, dragging all that data through the cache multiple times hurts. However, if you reuse the same vector and read the same amount of data, it should only happen the first time.
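To illustrate that last point with the names from the question (a small sketch, not a complete program): size one std::vector before the step loop and hand the same memory to every Get(), so the zero-initialization cost is paid a single time.

// Before the step loop: one allocation and one zero-initialization of nx*ny doubles
std::vector<double> buff(nx * ny);

// Inside the step loop: each Get() simply overwrites the same memory, no re-initialization
reader.Get(WvarNumbers, buff.data(), adios2::Mode::Deferred);
reader.PerformGets();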