Commit

Merge pull request #9 from pravirkr/ddmt
Initial direct dedisp
pravirkr authored Jun 6, 2024
2 parents f5f50cd + 305a462 commit d141ec0
Showing 10 changed files with 420 additions and 38 deletions.
61 changes: 61 additions & 0 deletions include/dmt/ddmt_base.hpp
@@ -0,0 +1,61 @@
#pragma once

#include <cstddef>
#include <vector>

using SizeType = std::size_t;

struct DDMTPlan {
    std::vector<float> dm_arr;
    // ndm x nchan
    std::vector<SizeType> delay_table;
    SizeType nchans;
};

class DDMT {
public:
    DDMT(float f_min,
         float f_max,
         SizeType nchans,
         float tsamp,
         float dm_max,
         float dm_step,
         float dm_min = 0.0F);

    DDMT(float f_min,
         float f_max,
         SizeType nchans,
         float tsamp,
         const float* dm_arr,
         SizeType dm_count);

    DDMT(const DDMT&) = delete;
    DDMT& operator=(const DDMT&) = delete;
    DDMT(DDMT&&) = delete;
    DDMT& operator=(DDMT&&) = delete;
    virtual ~DDMT() = default;

    const DDMTPlan& get_plan() const;
    std::vector<float> get_dm_grid() const;
    static void set_log_level(int level);
    virtual void execute(const float* __restrict waterfall,
                         SizeType waterfall_size,
                         float* __restrict dmt,
                         SizeType dmt_size) = 0;

private:
    float m_f_min;
    float m_f_max;
    SizeType m_nchans;
    float m_tsamp;
    std::vector<float> m_dm_arr;

    DDMTPlan m_ddmt_plan;

    void validate_inputs() const;
    void configure_ddmt_plan();
    static std::vector<float>
    generate_dm_arr(float dm_max, float dm_step, float dm_min);
    static std::vector<float> generate_dm_arr(const float* dm_arr,
                                              SizeType dm_count);
};
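
The plan's delay_table is flattened row-major as ndm x nchan, so each per-channel shift is a single offset lookup. A minimal sketch of the indexing (it matches the arithmetic execute_dedisp uses in lib/ddmt_cpu.cpp below; lookup_delay is an illustrative helper, not part of the API):

#include <dmt/ddmt_base.hpp>

// Sketch: shift (in samples) applied to channel i_chan for DM trial i_dm.
inline SizeType lookup_delay(const DDMTPlan& plan, SizeType i_dm,
                             SizeType i_chan) {
    return plan.delay_table[i_dm * plan.nchans + i_chan];
}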
39 changes: 39 additions & 0 deletions include/dmt/ddmt_cpu.hpp
@@ -0,0 +1,39 @@
#pragma once

#include <dmt/ddmt_base.hpp>

class DDMTCPU : public DDMT {
public:
    DDMTCPU(float f_min,
            float f_max,
            SizeType nchans,
            float tsamp,
            float dm_max,
            float dm_step,
            float dm_min = 0.0F);

    DDMTCPU(float f_min,
            float f_max,
            SizeType nchans,
            float tsamp,
            const float* dm_arr,
            SizeType dm_count);

    static void set_num_threads(int nthreads);
    void execute(const float* __restrict waterfall,
                 SizeType waterfall_size,
                 float* __restrict dmt,
                 SizeType dmt_size) override;

private:
    static void execute_dedisp(const float* __restrict d_in,
                               SizeType in_chan_stride,
                               SizeType in_samp_stride,
                               float* __restrict d_out,
                               SizeType out_dm_stride,
                               SizeType out_samp_stride,
                               const SizeType* __restrict delay_table,
                               SizeType dm_count,
                               SizeType nchans,
                               SizeType nsamps_reduced);
};
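
A minimal usage sketch for the CPU path; the band and DM-grid parameters here are hypothetical, and the waterfall is assumed channel-major (nchans x nsamps), matching the strides in lib/ddmt_cpu.cpp below:

#include <dmt/ddmt_cpu.hpp>

#include <vector>

int main() {
    // Hypothetical setup: 4096 channels spanning 1280-1536 MHz, 64 us
    // sampling, DM trials from 0 to 100 in steps of 0.5.
    DDMTCPU ddmt(1280.0F, 1536.0F, 4096, 64e-6F, 100.0F, 0.5F);
    const SizeType nsamps = 16384;
    std::vector<float> waterfall(4096 * nsamps); // channel-major input
    // One dedispersed series per DM trial, shortened by the plan's
    // largest delay.
    const auto dm_count = ddmt.get_dm_grid().size();
    const auto max_delay = ddmt.get_plan().delay_table.back();
    std::vector<float> dmt(dm_count * (nsamps - max_delay));
    ddmt.execute(waterfall.data(), waterfall.size(), dmt.data(), dmt.size());
    return 0;
}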
44 changes: 44 additions & 0 deletions include/dmt/ddmt_gpu.hpp
@@ -0,0 +1,44 @@
#pragma once

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <dmt/ddmt_base.hpp>

struct DDMTPlanD {
    thrust::device_vector<float> dm_arr_d;
    thrust::device_vector<int> delay_arr_d;
    thrust::device_vector<int> kill_mask_d;
};

class DDMTGPU : public DDMT {
public:
    DDMTGPU(float f_min,
            float f_max,
            SizeType nchans,
            SizeType nsamps,
            float tsamp,
            float dm_max,
            float dm_step,
            float dm_min = 0.0F);
    void execute(const float* __restrict waterfall,
                 SizeType waterfall_size,
                 float* __restrict dmt,
                 SizeType dmt_size) override;

    void execute(const float* __restrict waterfall,
                 SizeType waterfall_size,
                 float* __restrict dmt,
                 SizeType dmt_size,
                 bool device_flags);

private:
    DDMTPlanD m_ddmt_plan_d;

    static void transfer_plan_to_device(const DDMTPlan& plan,
                                        DDMTPlanD& plan_d);
    void execute_device(const float* __restrict waterfall,
                        SizeType waterfall_size,
                        float* __restrict dmt,
                        SizeType dmt_size);
};
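
A corresponding sketch for the GPU path. It assumes device_flags marks waterfall and dmt as device-resident pointers (this overload's semantics are not spelled out in the diff), and the parameters are again hypothetical:

#include <dmt/ddmt_gpu.hpp>

#include <thrust/device_vector.h>

void run_gpu(const thrust::device_vector<float>& waterfall_d,
             thrust::device_vector<float>& dmt_d) {
    // Note: unlike DDMTCPU, the GPU constructor also takes nsamps.
    DDMTGPU ddmt(1280.0F, 1536.0F, 4096, 16384, 64e-6F, 100.0F, 0.5F);
    ddmt.execute(thrust::raw_pointer_cast(waterfall_d.data()),
                 waterfall_d.size(),
                 thrust::raw_pointer_cast(dmt_d.data()), dmt_d.size(),
                 /*device_flags=*/true);
}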
90 changes: 90 additions & 0 deletions lib/ddmt_base.cpp
@@ -0,0 +1,90 @@
#include <stdexcept>

#include <spdlog/spdlog.h>

#include <dmt/ddmt_base.hpp>
#include <dmt/fdmt_utils.hpp>

DDMT::DDMT(float f_min,
           float f_max,
           SizeType nchans,
           float tsamp,
           float dm_max,
           float dm_step,
           float dm_min)
    : m_f_min(f_min),
      m_f_max(f_max),
      m_nchans(nchans),
      m_tsamp(tsamp),
      m_dm_arr(generate_dm_arr(dm_max, dm_step, dm_min)) {
    validate_inputs();
    configure_ddmt_plan();
    spdlog::debug("DDMT: dm_max={}, dm_min={}, dm_step={}", dm_max, dm_min,
                  dm_step);
}

DDMT::DDMT(float f_min,
           float f_max,
           SizeType nchans,
           float tsamp,
           const float* dm_arr,
           SizeType dm_count)
    : m_f_min(f_min),
      m_f_max(f_max),
      m_nchans(nchans),
      m_tsamp(tsamp),
      m_dm_arr(generate_dm_arr(dm_arr, dm_count)) {
    validate_inputs();
    configure_ddmt_plan();
    spdlog::debug("DDMT: dm_count={}", dm_count);
}

const DDMTPlan& DDMT::get_plan() const { return m_ddmt_plan; }

std::vector<float> DDMT::get_dm_grid() const { return m_ddmt_plan.dm_arr; }

void DDMT::set_log_level(int level) {
    if (level < static_cast<int>(spdlog::level::trace) ||
        level > static_cast<int>(spdlog::level::off)) {
        // Out-of-range levels fall back to the default; returning here keeps
        // the invalid value from being applied below.
        spdlog::set_level(spdlog::level::info);
        return;
    }
    spdlog::set_level(static_cast<spdlog::level::level_enum>(level));
}

void DDMT::configure_ddmt_plan() {
    m_ddmt_plan.nchans = m_nchans;
    m_ddmt_plan.dm_arr = m_dm_arr;
    const auto df = (m_f_max - m_f_min) / static_cast<float>(m_nchans);
    m_ddmt_plan.delay_table = ddmt::generate_delay_table(
        m_dm_arr.data(), m_dm_arr.size(), m_f_min, df, m_nchans, m_tsamp);
}

void DDMT::validate_inputs() const {
    if (m_f_min >= m_f_max) {
        throw std::invalid_argument("f_min must be less than f_max");
    }
    if (m_tsamp <= 0) {
        throw std::invalid_argument("tsamp must be greater than 0");
    }
    if (m_nchans == 0) {
        throw std::invalid_argument("nchans must be greater than 0");
    }
    if (m_dm_arr.empty()) {
        throw std::invalid_argument("dm_arr must not be empty");
    }
}

std::vector<float>
DDMT::generate_dm_arr(float dm_max, float dm_step, float dm_min) {
    std::vector<float> dm_arr;
    for (float dm = dm_min; dm <= dm_max; dm += dm_step) {
        dm_arr.push_back(dm);
    }
    return dm_arr;
}

std::vector<float> DDMT::generate_dm_arr(const float* dm_arr,
                                         SizeType dm_count) {
    return {dm_arr, dm_arr + dm_count};
}
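
The table itself comes from ddmt::generate_delay_table, declared in lib/dmt/fdmt_utils.hpp below; its implementation is outside this diff. For reference, incoherent dedispersion uses the cold-plasma delay Δt = k_DM · DM · (f⁻² − f_ref⁻²). A sketch of one per-channel delay in samples, assuming kDispConst carries the usual value of about 4.1488e3 MHz² s per pc cm⁻³ and that delays are referenced to the highest frequency so they stay non-negative (the real routine may use a different reference or channel ordering):

#include <cmath>
#include <cstddef>

// Sketch only; execute() in lib/ddmt_cpu.cpp relies on delay_table.back()
// being the overall maximum delay, so the real table must be laid out
// accordingly.
inline std::size_t delay_in_samples(float dm, float f_chan, float f_ref,
                                    float tsamp) {
    const float k_dm = 4.148808e3F; // assumed kDispConst, MHz^2 s / (pc cm^-3)
    const float dt =
        k_dm * dm * (1.0F / (f_chan * f_chan) - 1.0F / (f_ref * f_ref));
    return static_cast<std::size_t>(std::lround(dt / tsamp));
}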
88 changes: 88 additions & 0 deletions lib/ddmt_cpu.cpp
@@ -0,0 +1,88 @@
#include "dmt/ddmt_base.hpp"
#include <cmath>
#include <cstddef>
#include <spdlog/spdlog.h>
#ifdef USE_OPENMP
#include <omp.h>
#endif

#include <dmt/ddmt_cpu.hpp>

DDMTCPU::DDMTCPU(float f_min,
float f_max,
SizeType nchans,
float tsamp,
float dm_max,
float dm_step,
float dm_min)
: DDMT(f_min, f_max, nchans, tsamp, dm_max, dm_step, dm_min) {}

DDMTCPU::DDMTCPU(float f_min,
float f_max,
SizeType nchans,
float tsamp,
const float* dm_arr,
SizeType dm_count)
: DDMT(f_min, f_max, nchans, tsamp, dm_arr, dm_count) {}

void DDMTCPU::set_num_threads(int nthreads) {
#ifdef USE_OPENMP
omp_set_num_threads(nthreads);
#endif
}

void DDMTCPU::execute(const float* __restrict waterfall,
SizeType waterfall_size,
float* __restrict dmt,
SizeType dmt_size) {
const auto& plan = get_plan();
const auto nchans = plan.nchans;
const auto nsamps = waterfall_size / nchans;
const auto max_delay = plan.delay_table.back();
const auto nsamps_reduced = nsamps - max_delay;
const auto out_dm_stride = nsamps_reduced;
const auto out_samp_stride = 1;
const auto in_chan_stride = nsamps;
const auto in_samp_stride = 1;
const auto* delay_table = plan.delay_table.data();
const auto dm_count = plan.dm_arr.size();

if (dmt_size != dm_count * nsamps_reduced) {
spdlog::error("Output buffer size mismatch: expected {}, got {}",
dm_count * nsamps_reduced, dmt_size);
return;
}

execute_dedisp(waterfall, in_chan_stride, in_samp_stride, dmt,
out_dm_stride, out_samp_stride, delay_table, dm_count,
nchans, nsamps_reduced);
}

void DDMTCPU::execute_dedisp(const float* __restrict__ d_in,
size_t in_chan_stride,
size_t in_samp_stride,
float* __restrict__ d_out,
size_t out_dm_stride,
size_t out_samp_stride,
const size_t* __restrict__ delay_table,
size_t dm_count,
size_t nchans,
size_t nsamps_reduced) {
#pragma omp parallel for default(none) \
shared(d_in, d_out, delay_table, dm_count, nchans, nsamps_reduced, \
in_chan_stride, in_samp_stride, out_dm_stride, out_samp_stride)
for (size_t i_dm = 0; i_dm < dm_count; ++i_dm) {
const auto& delays = &delay_table[i_dm * nchans];
const auto out_idx = i_dm * out_dm_stride;
for (size_t i_samp = 0; i_samp < nsamps_reduced; ++i_samp) {
float sum = 0.0F;
#pragma omp simd reduction(+ : sum)
for (size_t i_chan = 0; i_chan < nchans; ++i_chan) {
const auto& delay = delays[i_chan];
sum += d_in[i_chan * in_chan_stride +
(i_samp + delay) * in_samp_stride];
}
d_out[out_idx + i_samp * out_samp_stride] = sum;
}
}
}
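
To make the size contract of execute() concrete, a worked example with illustrative numbers:

#include <cstddef>

// Illustrative values only. With 4 channels, 100 samples per channel, and a
// maximum plan delay of 10 samples, each DM trial yields 90 output samples;
// 8 DM trials therefore need a dmt buffer of 720 floats.
constexpr std::size_t nchans = 4, nsamps = 100, max_delay = 10, dm_count = 8;
constexpr std::size_t waterfall_size = nchans * nsamps;     // 400
constexpr std::size_t nsamps_reduced = nsamps - max_delay;  // 90
constexpr std::size_t dmt_size = dm_count * nsamps_reduced; // 720
static_assert(dmt_size == 720, "buffer-size contract (illustrative)");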
38 changes: 38 additions & 0 deletions lib/dmt/cuda_utils.cuh
@@ -0,0 +1,38 @@
#pragma once

#include <cuda_runtime.h>

#include <sstream>
#include <stdexcept>
#include <string>

namespace error_checker {

inline void
check_cuda_error(const char* file, int line, const std::string& msg = "") {
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::stringstream error_msg;
        error_msg << "CUDA failed with error: " << cudaGetErrorString(error)
                  << " (" << file << ":" << line << ")";
        if (!msg.empty()) {
            error_msg << " - " << msg;
        }
        throw std::runtime_error(error_msg.str());
    }
}

inline void
check_cuda_error_sync(const char* file, int line, const std::string& msg = "") {
    cudaDeviceSynchronize();
    check_cuda_error(file, line, msg);
}

// These wrappers must be inline: the header is included from multiple
// translation units, and non-inline definitions would break the
// one-definition rule at link time. Note that __FILE__/__LINE__ expand here,
// so the reported location is this header, not the call site; pass a
// descriptive msg to identify the caller.
inline void check_cuda(const std::string& msg = "") {
    check_cuda_error(__FILE__, __LINE__, msg);
}

inline void check_cuda_sync(const std::string& msg = "") {
    check_cuda_error_sync(__FILE__, __LINE__, msg);
}

} // namespace error_checker
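
Typical call-site usage, sketched with a hypothetical kernel (scale_kernel and scale are placeholders, not part of this diff):

#include "cuda_utils.cuh"

__global__ void scale_kernel(float* data, int n, float factor) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= factor; }
}

void scale(float* d_data, int n, float factor) {
    const int block = 256;
    const int grid = (n + block - 1) / block;
    scale_kernel<<<grid, block>>>(d_data, n, factor);
    error_checker::check_cuda("scale_kernel launch");   // launch-time errors
    error_checker::check_cuda_sync("scale_kernel run"); // execution errors
}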
9 changes: 9 additions & 0 deletions lib/dmt/fdmt_utils.hpp
@@ -11,6 +11,15 @@ constexpr float kDispConst = kDispConstMT;

using SizeType = std::size_t;

namespace ddmt {
std::vector<SizeType> generate_delay_table(const float* dm_arr,
                                           SizeType dm_count,
                                           float f0,
                                           float df,
                                           SizeType nchans,
                                           float tsamp);
} // namespace ddmt

namespace fdmt {

float cff(float f1_start, float f1_end, float f2_start, float f2_end);