From bb10165df3083cb3a7184790af094b721e759986 Mon Sep 17 00:00:00 2001 From: npkamath Date: Tue, 2 Jan 2024 13:02:01 -0500 Subject: [PATCH 01/27] added Dynp class --- .github/PULL_REQUEST_TEMPLATE.md | 2 + Dynp/.gitignore | 40 ++++++ Dynp/.gitmodules | 4 + Dynp/CMakeLists.txt | 28 ++++ Dynp/README.md | 2 + Dynp/extern/pybind11 | 1 + Dynp/src/cpp/dupin.cpp | 198 ++++++++++++++++++++++++++ Dynp/src/cpp/dupin.h | 131 +++++++++++++++++ Dynp/src/interface/dupininterface.cpp | 20 +++ Dynp/src/interface/dupininterface.py | 114 +++++++++++++++ PULL_REQUEST_TEMPLATE.md | 16 +++ 11 files changed, 556 insertions(+) create mode 100644 Dynp/.gitignore create mode 100644 Dynp/.gitmodules create mode 100644 Dynp/CMakeLists.txt create mode 100644 Dynp/README.md create mode 160000 Dynp/extern/pybind11 create mode 100644 Dynp/src/cpp/dupin.cpp create mode 100644 Dynp/src/cpp/dupin.h create mode 100644 Dynp/src/interface/dupininterface.cpp create mode 100644 Dynp/src/interface/dupininterface.py create mode 100644 PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 664d38d..cade1f9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,9 +1,11 @@ ## Description + ## Motivation + ## Do Changes Introduce Breaking Changes ## Have you (if appropriate) diff --git a/Dynp/.gitignore b/Dynp/.gitignore new file mode 100644 index 0000000..9ba0cf9 --- /dev/null +++ b/Dynp/.gitignore @@ -0,0 +1,40 @@ +# Visual Studio build files +*.vcxproj +*.vcxproj.filters +*.vcxproj.user +*.sln +*.vcproj +*.vcxproj.* +*.vcxproj.*.user +*.vspscc +*.vssscc +*.publishsettings +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +*.swp +*.user +*.suo +*.tlh +*.bak +*.[Cc]ache +*.DS_Store +x64 +build +.venv diff --git a/Dynp/.gitmodules b/Dynp/.gitmodules new file mode 100644 index 0000000..062cc71 --- /dev/null +++ b/Dynp/.gitmodules @@ -0,0 +1,4 @@ +[submodule "extern/pybind11"] + path = extern/pybind11 + url = ../../pybind/pybind11 + branch = stable diff --git a/Dynp/CMakeLists.txt b/Dynp/CMakeLists.txt new file mode 100644 index 0000000..e495afa --- /dev/null +++ b/Dynp/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.8) +project(_dupin VERSION 1.0.0) # Replace with your actual version + +set(DEFAULT_BUILD_TYPE "Release") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE}) +endif() + +find_package(Eigen3 REQUIRED) +find_package(TBB REQUIRED) +add_subdirectory(extern/pybind11) + +pybind11_add_module(_dupin src/cpp/dupin.h src/cpp/dupin.cpp src/interface/dupininterface.cpp) + +target_include_directories(_dupin PRIVATE + ${EIGEN3_INCLUDE_DIR} + ${TBB_INCLUDE_DIRS} + "${CMAKE_CURRENT_SOURCE_DIR}/eigen-3.4.0" +) + +target_link_libraries(_dupin PRIVATE TBB::tbb) +target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION}) +target_compile_options(_dupin PRIVATE -O2 -march=native) + +set_target_properties(_dupin PROPERTIES + CXX_STANDARD 17 + CMAKE_CXX_STANDARD_REQUIRED True +) diff --git a/Dynp/README.md b/Dynp/README.md new file mode 100644 index 0000000..03e0cc7 --- /dev/null +++ b/Dynp/README.md @@ -0,0 +1,2 @@ +Requires installation of dupin.py +C++ dependencies: intel-tbb, eigen diff --git a/Dynp/extern/pybind11 b/Dynp/extern/pybind11 new file mode 160000 index 0000000..8b03ffa --- /dev/null +++ b/Dynp/extern/pybind11 @@ -0,0 +1 @@ +Subproject commit 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07 diff --git a/Dynp/src/cpp/dupin.cpp b/Dynp/src/cpp/dupin.cpp new file mode 100644 index 0000000..a6f9a47 --- /dev/null +++ b/Dynp/src/cpp/dupin.cpp @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include "dupin.h" +#include +#include +#include +#include + + +using namespace std; +using namespace Eigen; + + +DynamicProgramming::DynamicProgramming() + : num_bkps(0), num_parameters(0), num_timesteps(0), jump(1), min_size(3) { + +} + +DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, int jump_, int min_size_) + : num_bkps(num_bkps_), num_parameters(num_parameters_), + num_timesteps(num_timesteps_), jump(jump_), min_size(min_size_) { + +} + +void DynamicProgramming::read_input() { + cin >> jump >> min_size >> num_bkps >> num_parameters >> num_timesteps; + datum.resize(num_timesteps, num_parameters); + + for (int i = 0; i < num_timesteps; ++i) { + for (int j = 0; j < num_parameters; ++j) { + cin >> datum(i, j); + } + } +} + + +void DynamicProgramming::scale_datum() { + VectorXd min_val = datum.colwise().minCoeff(); + VectorXd max_val = datum.colwise().maxCoeff(); + VectorXd range = max_val - min_val; + + for (int j = 0; j < num_parameters; ++j) { + if (range(j) == 0.0) { + datum.col(j).setZero(); + } else { + datum.col(j) = (datum.col(j).array() - min_val(j)) / range(j); + } + } +} +void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { + lfit.x = VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / (num_timesteps - 1); + lfit.y = datum; +} + +VectorXd DynamicProgramming::regression_line(int start, int end, int dim, linear_fit_struct &lfit) { + int n = end - start; + VectorXd x = lfit.x.segment(start, n); + VectorXd y = lfit.y.col(dim).segment(start, n); + + double x_mean = x.mean(); + double y_mean = y.mean(); + + VectorXd x_centered = x.array() - x_mean; + VectorXd y_centered = y.array() - y_mean; + + double slope = x_centered.dot(y_centered) / x_centered.squaredNorm(); + double intercept = y_mean - slope * x_mean; + + return x.unaryExpr([slope, intercept](double xi) { return slope * xi + intercept; }); +} + + +double DynamicProgramming::l2_cost(MatrixXd &predicted_y, int start, int end) { + MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - datum.block(start, 0, end - start, num_parameters); + return sqrt(diff.array().square().sum()); +} + + + +MatrixXd DynamicProgramming::predicted(int start, int end, linear_fit_struct &lfit) { + MatrixXd predicted_y(num_timesteps, num_parameters); + for (int i = 0; i < num_parameters; ++i) { + predicted_y.block(start, i, end - start, 1) = regression_line(start, end, i, lfit); + } + return predicted_y; +} + +double DynamicProgramming::cost_function(int start, int end) { + linear_fit_struct lfit; + regression_setup(lfit); + MatrixXd predicted_y = predicted(start, end, lfit); + return l2_cost(predicted_y, start, end); +} + +void DynamicProgramming::initialize_cost_matrix() { + scale_datum(); + cost_matrix.initialize(num_timesteps); + + tbb::parallel_for(tbb::blocked_range(0, num_timesteps), [&](const tbb::blocked_range& r) { + for (int i = r.begin(); i < r.end(); ++i) { + for (int j = i + min_size; j < num_timesteps; ++j) { + cost_matrix(i, j) = cost_function(i, j); + } + } + }); +} + +pair> DynamicProgramming::seg(int start, int end, int num_bkps) { + MemoKey key = {start, end, num_bkps}; + auto it = memo.find(key); + if (it != memo.end()) { + return it->second; + } +// no_memo++; + if (num_bkps == 0) { + return {cost_matrix(start, end), {end}}; + } + + pair> best = {numeric_limits::infinity(), {}}; + + for (int bkp = start + min_size; bkp < end; bkp++) { + if ((bkp - start) >= min_size && (end - bkp) >= min_size) { + auto left = seg(start, bkp, num_bkps - 1); + auto right = seg(bkp, end, 0); + double cost = left.first + right.first; + if (cost < best.first) { + best.first = cost; + best.second = left.second; + best.second.push_back(bkp); + best.second.insert(best.second.end(), right.second.begin(), right.second.end()); + } + } + } + + memo[key] = best; + return best; +} + +vector DynamicProgramming::return_breakpoints() { + auto result = seg(0, num_timesteps-1, num_bkps); + vector breakpoints = result.second; + sort(breakpoints.begin(), breakpoints.end()); + breakpoints.erase(unique(breakpoints.begin(), breakpoints.end()), breakpoints.end()); + return breakpoints; +} + +void set_parallelization(int num_threads) { + static tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); +} + +int DynamicProgramming::get_num_timesteps() { + return num_timesteps; +} + +int DynamicProgramming::get_num_parameters() { + return num_parameters; +} + +int DynamicProgramming::get_num_bkps() { + return num_bkps; +} + +Eigen::MatrixXd& DynamicProgramming::getDatum() { + return datum; +} + +DynamicProgramming::upper_triangular_cost_matrix& DynamicProgramming::getCostMatrix() { + return cost_matrix; +} + +void DynamicProgramming::set_num_timesteps(int value) { + num_timesteps = value; +} + +void DynamicProgramming::set_num_parameters(int value) { + num_parameters = value; +} + +void DynamicProgramming::set_num_bkps(int value) { + num_bkps = value; +} + +void DynamicProgramming::setDatum(const Eigen::MatrixXd& value) { + datum = value; +} + +void DynamicProgramming::setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix& value) { + cost_matrix = value; +} + +int main() { + + return 0; + +} \ No newline at end of file diff --git a/Dynp/src/cpp/dupin.h b/Dynp/src/cpp/dupin.h new file mode 100644 index 0000000..5724511 --- /dev/null +++ b/Dynp/src/cpp/dupin.h @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + + + +// DynamicProgramming class for dynamic programming based segmentation. +class DynamicProgramming { //change name to Dynamic Programming +private: + class upper_triangular_cost_matrix { + private: + std::vector matrix; + int size; + + int index(int i, int j) const { + return i * (2 * size - i + 1) / 2 + (j - i); + } + + public: + upper_triangular_cost_matrix() : size(0) {} + + void initialize(int n) { + size = n; + matrix.resize(n * (n + 1) / 2, 0.0); + } + + double& operator()(int i, int j) { + return matrix[index(i, j)]; + } + + int getSize() const { + return size; + } + + }; + upper_triangular_cost_matrix cost_matrix; + // Struct for memoization key, combining start, end, and number of breakpoints. + struct MemoKey { + int start; + int end; + int num_bkps; + + // Comparison operator for MemoKey. + bool operator==(const MemoKey &other) const { + return start == other.start && end == other.end && num_bkps == other.num_bkps; + } + }; + + // Custom XOR-bit hash function for MemoKey, avoids clustering of data in unordered map to improve efficiency. + struct MemoKeyHash { //test without hash function + std::size_t operator()(const MemoKey& key) const { + return ((std::hash()(key.start) ^ (std::hash()(key.end) << 1)) >> 1) ^ std::hash()(key.num_bkps); + } + }; + + // Memoization map to store the cost and partition for given parameters. + std::unordered_map>, MemoKeyHash> memo; + + int num_bkps; // Number of breakpoints to detect. + int num_parameters; // Number of features in the dataset. + int num_timesteps; // Number of data points (time steps). + int jump; // Interval for checking potential breakpoints. + int min_size; // Minimum size of a segment. + Eigen::MatrixXd datum; // Matrix storing the dataset. + + + + // Structure for storing linear regression parameters. + struct linear_fit_struct { + Eigen::MatrixXd y; // Dependent variable (labels). + Eigen::VectorXd x; //z Independent variable (time steps). + }; + +public: + // Default constructor. + DynamicProgramming(); + + // Parameterized constructor. + DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, int jump_, int min_size_); + + // Scales the dataset using min-max normalization. + void scale_datum(); + + // Reads input data from standard input - mainly for testing purposes. + void read_input(); + + // Prepares data for linear regression. + void regression_setup(linear_fit_struct& lfit); + + // Calculates the regression line for a given data segment. + Eigen::VectorXd regression_line(int start, int end, int dim, linear_fit_struct& lfit); + + // Generates predicted values based on the linear regression model. + Eigen::MatrixXd predicted(int start, int end, linear_fit_struct& lfit); + + // Calculates L2 cost (Euclidean distance) between predicted and actual data. + double l2_cost(Eigen::MatrixXd &predicted_y, int start, int end); + + // Computes the cost of a specific data segment using linear regression. + double cost_function(int start, int end); + + // Initializes and fills the cost matrix for all data segments. + void initialize_cost_matrix(); + + // Recursive function for dynamic programming segmentation. + std::pair> seg(int start, int end, int num_bkps); + + void set_parallelization(int num_threads); + // Returns the optimal set of breakpoints after segmentation. + std::vector return_breakpoints(); + + // Getter functions for accessing private class members. + int get_num_timesteps(); + int get_num_parameters(); + int get_num_bkps(); + Eigen::MatrixXd& getDatum(); + DynamicProgramming::upper_triangular_cost_matrix& getCostMatrix(); + + // Setter functions for modifying private class members. + void set_num_timesteps(int value); + void set_num_parameters(int value); + void set_num_bkps(int value); + void setDatum(const Eigen::MatrixXd& value); + void setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix& value); +}; diff --git a/Dynp/src/interface/dupininterface.cpp b/Dynp/src/interface/dupininterface.cpp new file mode 100644 index 0000000..5d63c94 --- /dev/null +++ b/Dynp/src/interface/dupininterface.cpp @@ -0,0 +1,20 @@ +#include +#include +#include +#include "dupin.h" + +namespace py = pybind11; + +PYBIND11_MODULE(_dupin, m) { + py::class_(m, "DynamicProgramming") + .def_property("datum", &DynamicProgramming::getDatum, &DynamicProgramming::setDatum) + .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, &DynamicProgramming::setCostMatrix) + .def_property("num_bkps", &DynamicProgramming::get_num_bkps, &DynamicProgramming::set_num_bkps) + .def_property("num_timesteps", &DynamicProgramming::get_num_timesteps, &DynamicProgramming::set_num_timesteps) + .def_property("num_parameters", &DynamicProgramming::get_num_parameters, &DynamicProgramming::set_num_parameters) + .def(py::init<>()) + .def("read_input", &DynamicProgramming::read_input) + .def("initialize_cost_matrix", &DynamicProgramming::initialize_cost_matrix) + .def("return_breakpoints", &DynamicProgramming::return_breakpoints) + .def("set_threads", &DynamicProgramming::set_parallelization); +} diff --git a/Dynp/src/interface/dupininterface.py b/Dynp/src/interface/dupininterface.py new file mode 100644 index 0000000..58d9d1a --- /dev/null +++ b/Dynp/src/interface/dupininterface.py @@ -0,0 +1,114 @@ +import numpy as np +import _dupin +import random +import dupin as du +import ruptures as rpt +import time +import os +import psutil +from dupin.detect.costs import CostLinearFit + + +random.seed(445) +def compute_python_cost_matrix(signal: np.ndarray, min_size: int = 3): + cost = CostLinearFit(metric = "l2") + cost.fit(signal) + n_samples = signal.shape[0] + pycost_matrix = np.zeros((n_samples, n_samples)) + for start in range(n_samples): + for end in range(start + min_size, n_samples): + pycost_matrix[start, end] = cost.error(start, end) + return pycost_matrix + + +def compute_cplus_cost_matrix(signal: np.ndarray): + + algo = _dupin.DynamicProgramming() + algo.num_bkps = 4; + algo.num_timesteps = signal.shape[0] + algo.num_parameters = signal.shape[1] + algo.datum = np.asfortranarray(signal) + start1 = time.time() + algo.initialize_cost_matrix() + end1 = time.time() + topcppbkps = algo.return_breakpoints() + end2 = time.time() + cpptime = end1-start1 + cpptime2 = end2-end1 + print("costtime: ", cpptime, "\n", "segtime: ", cpptime2) + return topcppbkps, cpptime + +def compute_cplus_cost_matrix2d(signal: np.ndarray): + + algo = _dupin.DupinAlgo() + algo.num_bkps = 4; + algo.num_timesteps = signal.shape[0] + algo.num_parameters = signal.shape[1] + algo.datum = np.asfortranarray(signal) + start1 = time.time() + algo.initialize_cost_matrix2d() + + topcppbkps = algo.getTopDownBreakpoints("old") + end1 = time.time() + cpptime = end1-start1 + print("cpptime2d: ", cpptime) + return topcppbkps, cpptime + + + + +def generate_1_feature_data(size): + + data = (np.repeat([0, 200, 400, 600, 800], size / 5) + np.random.random(size)).reshape((-1, 1)) + return data + +def compute_cpp_operations(data): + topcppbkps, cpptime = compute_cplus_cost_matrix(data) + return topcppbkps, cpptime + +def compute_cpp_operations2d(data): + topcppbkps, cpptime = compute_cplus_cost_matrix2d(data) + return topcppbkps, cpptime + + +def compute_python_operations(data): + start_time = time.time() + dynp = rpt.Dynp(custom_cost=du.detect.CostLinearFit("l2"), jump=1) + sweepsweep = du.detect.SweepDetector(dynp, 10) + sweepsweep.fit(data) + end_time = time.time() + pythontime = end_time - start_time + return sweepsweep.change_points_, sweepsweep.opt_change_points_, pythontime + + +p = psutil.Process(os.getpid()) +#p.cpu_affinity([0]) # Limiting to core 0 + +def test_dupin(): + + datum = generate_1_feature_data(5000) + # C++ Operations + total_time = 0 +# for i in range(1,11): +# cppcost_matrix, topcppbkps, cpptime = compute_cpp_operations(current_qdata) +# total_time = total_time + cpptime +# print ("Average time for:",current_data.shape, ": ", total_time/10) + + + topcppbkps, cpptime = compute_cpp_operations(datum) + +# topcppbkps2, cpptime2 = compute_cpp_operations2d(datum) + print("Top Down C++ breakpoints:", topcppbkps) + # print("Top Down C++ breakpoints2d:", topcppbkps2) + + # Python Operations +# python_bkps, opt_python_bkps, pythontime = compute_python_operations(datum) + + # Comparing and printing results +# multiplier = pythontime / cpptime + # print("C++ is", multiplier, "times faster!") + # print(f"Python breakpoints: {python_bkps} opt change points: {opt_python_bkps}") + + +if __name__ == "__main__": + test_dupin() diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..2e41e67 --- /dev/null +++ b/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Description + +Added c++ implementation of the dynamic programming class to dupin. All changes are in the Dynp folder to be implemented in. + +## Motivation + +c++ implementation is faster and more memory efficient; allows for more optimization techniques than the python code. + +## Do Changes Introduce Breaking Changes +yes + +## Have you (if appropriate) +- [ ] Updated changelog +- [ ] Updated Documentation +- [ ] Add tests +- [ ] Added name to contributors From d13dfc7a5e7e314a54e55768807dad245815109d Mon Sep 17 00:00:00 2001 From: npkamath Date: Tue, 2 Jan 2024 13:11:29 -0500 Subject: [PATCH 02/27] reverted extra spaces --- .github/PULL_REQUEST_TEMPLATE.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index cade1f9..664d38d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,11 +1,9 @@ ## Description - ## Motivation - ## Do Changes Introduce Breaking Changes ## Have you (if appropriate) From 5bbbaf29808f069dbbebb0bad8c4ad494e791803 Mon Sep 17 00:00:00 2001 From: Ninad Kmath <98561561+npkamath@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:13:49 -0500 Subject: [PATCH 03/27] reverted spaces --- .github/PULL_REQUEST_TEMPLATE.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index cade1f9..664d38d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,11 +1,9 @@ ## Description - ## Motivation - ## Do Changes Introduce Breaking Changes ## Have you (if appropriate) From 930430a696b2f90c8b3959139c23f2143a30e9ed Mon Sep 17 00:00:00 2001 From: npkamath Date: Tue, 2 Jan 2024 13:17:44 -0500 Subject: [PATCH 04/27] removed read_input function --- Dynp/src/cpp/dupin.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Dynp/src/cpp/dupin.cpp b/Dynp/src/cpp/dupin.cpp index a6f9a47..de24d0f 100644 --- a/Dynp/src/cpp/dupin.cpp +++ b/Dynp/src/cpp/dupin.cpp @@ -25,17 +25,6 @@ DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, int n } -void DynamicProgramming::read_input() { - cin >> jump >> min_size >> num_bkps >> num_parameters >> num_timesteps; - datum.resize(num_timesteps, num_parameters); - - for (int i = 0; i < num_timesteps; ++i) { - for (int j = 0; j < num_parameters; ++j) { - cin >> datum(i, j); - } - } -} - void DynamicProgramming::scale_datum() { VectorXd min_val = datum.colwise().minCoeff(); From e35cff1b0fa9928a9a47ee4fb5f79cf1744a6416 Mon Sep 17 00:00:00 2001 From: npkamath Date: Tue, 2 Jan 2024 13:20:59 -0500 Subject: [PATCH 05/27] removed read_input function --- Dynp/src/cpp/dupin.cpp | 1 - Dynp/src/cpp/dupin.h | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/Dynp/src/cpp/dupin.cpp b/Dynp/src/cpp/dupin.cpp index de24d0f..6491e69 100644 --- a/Dynp/src/cpp/dupin.cpp +++ b/Dynp/src/cpp/dupin.cpp @@ -25,7 +25,6 @@ DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, int n } - void DynamicProgramming::scale_datum() { VectorXd min_val = datum.colwise().minCoeff(); VectorXd max_val = datum.colwise().maxCoeff(); diff --git a/Dynp/src/cpp/dupin.h b/Dynp/src/cpp/dupin.h index 5724511..01f6137 100644 --- a/Dynp/src/cpp/dupin.h +++ b/Dynp/src/cpp/dupin.h @@ -86,10 +86,7 @@ class DynamicProgramming { //change name to Dynamic Programming // Scales the dataset using min-max normalization. void scale_datum(); - - // Reads input data from standard input - mainly for testing purposes. - void read_input(); - + // Prepares data for linear regression. void regression_setup(linear_fit_struct& lfit); From e8ccb7c58fef45629c0a7cfd34d248420cae04b2 Mon Sep 17 00:00:00 2001 From: Ninad Kmath <98561561+npkamath@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:23:20 -0500 Subject: [PATCH 06/27] Delete PULL_REQUEST_TEMPLATE.md --- PULL_REQUEST_TEMPLATE.md | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 PULL_REQUEST_TEMPLATE.md diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 2e41e67..0000000 --- a/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,16 +0,0 @@ -## Description - -Added c++ implementation of the dynamic programming class to dupin. All changes are in the Dynp folder to be implemented in. - -## Motivation - -c++ implementation is faster and more memory efficient; allows for more optimization techniques than the python code. - -## Do Changes Introduce Breaking Changes -yes - -## Have you (if appropriate) -- [ ] Updated changelog -- [ ] Updated Documentation -- [ ] Add tests -- [ ] Added name to contributors From fccca35a3922edff72233bc0f659e6a7c3117212 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Mon, 8 Jan 2024 19:11:02 -0500 Subject: [PATCH 07/27] refactor: Remove Dynp directory Also format C++ files using clang-format. --- CMakeLists.txt | 14 ++ Dynp/.gitignore | 40 ------ Dynp/.gitmodules | 4 - Dynp/CMakeLists.txt | 28 ---- Dynp/README.md | 2 - Dynp/extern/pybind11 | 1 - Dynp/src/cpp/dupin.cpp | 186 -------------------------- Dynp/src/cpp/dupin.h | 128 ------------------ Dynp/src/interface/dupininterface.cpp | 20 --- Dynp/src/interface/dupininterface.py | 114 ---------------- src/CMakeLists.txt | 17 +++ src/dupin.cpp | 176 ++++++++++++++++++++++++ src/dupin.h | 129 ++++++++++++++++++ src/dupininterface.cpp | 25 ++++ 14 files changed, 361 insertions(+), 523 deletions(-) create mode 100644 CMakeLists.txt delete mode 100644 Dynp/.gitignore delete mode 100644 Dynp/.gitmodules delete mode 100644 Dynp/CMakeLists.txt delete mode 100644 Dynp/README.md delete mode 160000 Dynp/extern/pybind11 delete mode 100644 Dynp/src/cpp/dupin.cpp delete mode 100644 Dynp/src/cpp/dupin.h delete mode 100644 Dynp/src/interface/dupininterface.cpp delete mode 100644 Dynp/src/interface/dupininterface.py create mode 100644 src/CMakeLists.txt create mode 100644 src/dupin.cpp create mode 100644 src/dupin.h create mode 100644 src/dupininterface.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..570a21b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.8) +project(_dupin VERSION 0.0.1) + +set(DEFAULT_BUILD_TYPE "Release") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE}) +endif() + +find_package(Eigen3 REQUIRED) +find_package(TBB REQUIRED) +find_package(pybind11 CONFIG REQUIRED) + +include_directories(${PROJECT_SOURCE_DIR}/src) +add_subdirectory(src) diff --git a/Dynp/.gitignore b/Dynp/.gitignore deleted file mode 100644 index 9ba0cf9..0000000 --- a/Dynp/.gitignore +++ /dev/null @@ -1,40 +0,0 @@ -# Visual Studio build files -*.vcxproj -*.vcxproj.filters -*.vcxproj.user -*.sln -*.vcproj -*.vcxproj.* -*.vcxproj.*.user -*.vspscc -*.vssscc -*.publishsettings -*_i.c -*_p.c -*.ilk -*.meta -*.obj -*.pch -*.pdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -*.swp -*.user -*.suo -*.tlh -*.bak -*.[Cc]ache -*.DS_Store -x64 -build -.venv diff --git a/Dynp/.gitmodules b/Dynp/.gitmodules deleted file mode 100644 index 062cc71..0000000 --- a/Dynp/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "extern/pybind11"] - path = extern/pybind11 - url = ../../pybind/pybind11 - branch = stable diff --git a/Dynp/CMakeLists.txt b/Dynp/CMakeLists.txt deleted file mode 100644 index e495afa..0000000 --- a/Dynp/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -cmake_minimum_required(VERSION 3.8) -project(_dupin VERSION 1.0.0) # Replace with your actual version - -set(DEFAULT_BUILD_TYPE "Release") -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE}) -endif() - -find_package(Eigen3 REQUIRED) -find_package(TBB REQUIRED) -add_subdirectory(extern/pybind11) - -pybind11_add_module(_dupin src/cpp/dupin.h src/cpp/dupin.cpp src/interface/dupininterface.cpp) - -target_include_directories(_dupin PRIVATE - ${EIGEN3_INCLUDE_DIR} - ${TBB_INCLUDE_DIRS} - "${CMAKE_CURRENT_SOURCE_DIR}/eigen-3.4.0" -) - -target_link_libraries(_dupin PRIVATE TBB::tbb) -target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION}) -target_compile_options(_dupin PRIVATE -O2 -march=native) - -set_target_properties(_dupin PROPERTIES - CXX_STANDARD 17 - CMAKE_CXX_STANDARD_REQUIRED True -) diff --git a/Dynp/README.md b/Dynp/README.md deleted file mode 100644 index 03e0cc7..0000000 --- a/Dynp/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Requires installation of dupin.py -C++ dependencies: intel-tbb, eigen diff --git a/Dynp/extern/pybind11 b/Dynp/extern/pybind11 deleted file mode 160000 index 8b03ffa..0000000 --- a/Dynp/extern/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07 diff --git a/Dynp/src/cpp/dupin.cpp b/Dynp/src/cpp/dupin.cpp deleted file mode 100644 index 6491e69..0000000 --- a/Dynp/src/cpp/dupin.cpp +++ /dev/null @@ -1,186 +0,0 @@ -#include -#include -#include -#include -#include -#include "dupin.h" -#include -#include -#include -#include - - -using namespace std; -using namespace Eigen; - - -DynamicProgramming::DynamicProgramming() - : num_bkps(0), num_parameters(0), num_timesteps(0), jump(1), min_size(3) { - -} - -DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, int jump_, int min_size_) - : num_bkps(num_bkps_), num_parameters(num_parameters_), - num_timesteps(num_timesteps_), jump(jump_), min_size(min_size_) { - -} - -void DynamicProgramming::scale_datum() { - VectorXd min_val = datum.colwise().minCoeff(); - VectorXd max_val = datum.colwise().maxCoeff(); - VectorXd range = max_val - min_val; - - for (int j = 0; j < num_parameters; ++j) { - if (range(j) == 0.0) { - datum.col(j).setZero(); - } else { - datum.col(j) = (datum.col(j).array() - min_val(j)) / range(j); - } - } -} -void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { - lfit.x = VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / (num_timesteps - 1); - lfit.y = datum; -} - -VectorXd DynamicProgramming::regression_line(int start, int end, int dim, linear_fit_struct &lfit) { - int n = end - start; - VectorXd x = lfit.x.segment(start, n); - VectorXd y = lfit.y.col(dim).segment(start, n); - - double x_mean = x.mean(); - double y_mean = y.mean(); - - VectorXd x_centered = x.array() - x_mean; - VectorXd y_centered = y.array() - y_mean; - - double slope = x_centered.dot(y_centered) / x_centered.squaredNorm(); - double intercept = y_mean - slope * x_mean; - - return x.unaryExpr([slope, intercept](double xi) { return slope * xi + intercept; }); -} - - -double DynamicProgramming::l2_cost(MatrixXd &predicted_y, int start, int end) { - MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - datum.block(start, 0, end - start, num_parameters); - return sqrt(diff.array().square().sum()); -} - - - -MatrixXd DynamicProgramming::predicted(int start, int end, linear_fit_struct &lfit) { - MatrixXd predicted_y(num_timesteps, num_parameters); - for (int i = 0; i < num_parameters; ++i) { - predicted_y.block(start, i, end - start, 1) = regression_line(start, end, i, lfit); - } - return predicted_y; -} - -double DynamicProgramming::cost_function(int start, int end) { - linear_fit_struct lfit; - regression_setup(lfit); - MatrixXd predicted_y = predicted(start, end, lfit); - return l2_cost(predicted_y, start, end); -} - -void DynamicProgramming::initialize_cost_matrix() { - scale_datum(); - cost_matrix.initialize(num_timesteps); - - tbb::parallel_for(tbb::blocked_range(0, num_timesteps), [&](const tbb::blocked_range& r) { - for (int i = r.begin(); i < r.end(); ++i) { - for (int j = i + min_size; j < num_timesteps; ++j) { - cost_matrix(i, j) = cost_function(i, j); - } - } - }); -} - -pair> DynamicProgramming::seg(int start, int end, int num_bkps) { - MemoKey key = {start, end, num_bkps}; - auto it = memo.find(key); - if (it != memo.end()) { - return it->second; - } -// no_memo++; - if (num_bkps == 0) { - return {cost_matrix(start, end), {end}}; - } - - pair> best = {numeric_limits::infinity(), {}}; - - for (int bkp = start + min_size; bkp < end; bkp++) { - if ((bkp - start) >= min_size && (end - bkp) >= min_size) { - auto left = seg(start, bkp, num_bkps - 1); - auto right = seg(bkp, end, 0); - double cost = left.first + right.first; - if (cost < best.first) { - best.first = cost; - best.second = left.second; - best.second.push_back(bkp); - best.second.insert(best.second.end(), right.second.begin(), right.second.end()); - } - } - } - - memo[key] = best; - return best; -} - -vector DynamicProgramming::return_breakpoints() { - auto result = seg(0, num_timesteps-1, num_bkps); - vector breakpoints = result.second; - sort(breakpoints.begin(), breakpoints.end()); - breakpoints.erase(unique(breakpoints.begin(), breakpoints.end()), breakpoints.end()); - return breakpoints; -} - -void set_parallelization(int num_threads) { - static tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); -} - -int DynamicProgramming::get_num_timesteps() { - return num_timesteps; -} - -int DynamicProgramming::get_num_parameters() { - return num_parameters; -} - -int DynamicProgramming::get_num_bkps() { - return num_bkps; -} - -Eigen::MatrixXd& DynamicProgramming::getDatum() { - return datum; -} - -DynamicProgramming::upper_triangular_cost_matrix& DynamicProgramming::getCostMatrix() { - return cost_matrix; -} - -void DynamicProgramming::set_num_timesteps(int value) { - num_timesteps = value; -} - -void DynamicProgramming::set_num_parameters(int value) { - num_parameters = value; -} - -void DynamicProgramming::set_num_bkps(int value) { - num_bkps = value; -} - -void DynamicProgramming::setDatum(const Eigen::MatrixXd& value) { - datum = value; -} - -void DynamicProgramming::setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix& value) { - cost_matrix = value; -} - -int main() { - - return 0; - -} \ No newline at end of file diff --git a/Dynp/src/cpp/dupin.h b/Dynp/src/cpp/dupin.h deleted file mode 100644 index 01f6137..0000000 --- a/Dynp/src/cpp/dupin.h +++ /dev/null @@ -1,128 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - - - -// DynamicProgramming class for dynamic programming based segmentation. -class DynamicProgramming { //change name to Dynamic Programming -private: - class upper_triangular_cost_matrix { - private: - std::vector matrix; - int size; - - int index(int i, int j) const { - return i * (2 * size - i + 1) / 2 + (j - i); - } - - public: - upper_triangular_cost_matrix() : size(0) {} - - void initialize(int n) { - size = n; - matrix.resize(n * (n + 1) / 2, 0.0); - } - - double& operator()(int i, int j) { - return matrix[index(i, j)]; - } - - int getSize() const { - return size; - } - - }; - upper_triangular_cost_matrix cost_matrix; - // Struct for memoization key, combining start, end, and number of breakpoints. - struct MemoKey { - int start; - int end; - int num_bkps; - - // Comparison operator for MemoKey. - bool operator==(const MemoKey &other) const { - return start == other.start && end == other.end && num_bkps == other.num_bkps; - } - }; - - // Custom XOR-bit hash function for MemoKey, avoids clustering of data in unordered map to improve efficiency. - struct MemoKeyHash { //test without hash function - std::size_t operator()(const MemoKey& key) const { - return ((std::hash()(key.start) ^ (std::hash()(key.end) << 1)) >> 1) ^ std::hash()(key.num_bkps); - } - }; - - // Memoization map to store the cost and partition for given parameters. - std::unordered_map>, MemoKeyHash> memo; - - int num_bkps; // Number of breakpoints to detect. - int num_parameters; // Number of features in the dataset. - int num_timesteps; // Number of data points (time steps). - int jump; // Interval for checking potential breakpoints. - int min_size; // Minimum size of a segment. - Eigen::MatrixXd datum; // Matrix storing the dataset. - - - - // Structure for storing linear regression parameters. - struct linear_fit_struct { - Eigen::MatrixXd y; // Dependent variable (labels). - Eigen::VectorXd x; //z Independent variable (time steps). - }; - -public: - // Default constructor. - DynamicProgramming(); - - // Parameterized constructor. - DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, int jump_, int min_size_); - - // Scales the dataset using min-max normalization. - void scale_datum(); - - // Prepares data for linear regression. - void regression_setup(linear_fit_struct& lfit); - - // Calculates the regression line for a given data segment. - Eigen::VectorXd regression_line(int start, int end, int dim, linear_fit_struct& lfit); - - // Generates predicted values based on the linear regression model. - Eigen::MatrixXd predicted(int start, int end, linear_fit_struct& lfit); - - // Calculates L2 cost (Euclidean distance) between predicted and actual data. - double l2_cost(Eigen::MatrixXd &predicted_y, int start, int end); - - // Computes the cost of a specific data segment using linear regression. - double cost_function(int start, int end); - - // Initializes and fills the cost matrix for all data segments. - void initialize_cost_matrix(); - - // Recursive function for dynamic programming segmentation. - std::pair> seg(int start, int end, int num_bkps); - - void set_parallelization(int num_threads); - // Returns the optimal set of breakpoints after segmentation. - std::vector return_breakpoints(); - - // Getter functions for accessing private class members. - int get_num_timesteps(); - int get_num_parameters(); - int get_num_bkps(); - Eigen::MatrixXd& getDatum(); - DynamicProgramming::upper_triangular_cost_matrix& getCostMatrix(); - - // Setter functions for modifying private class members. - void set_num_timesteps(int value); - void set_num_parameters(int value); - void set_num_bkps(int value); - void setDatum(const Eigen::MatrixXd& value); - void setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix& value); -}; diff --git a/Dynp/src/interface/dupininterface.cpp b/Dynp/src/interface/dupininterface.cpp deleted file mode 100644 index 5d63c94..0000000 --- a/Dynp/src/interface/dupininterface.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include -#include -#include "dupin.h" - -namespace py = pybind11; - -PYBIND11_MODULE(_dupin, m) { - py::class_(m, "DynamicProgramming") - .def_property("datum", &DynamicProgramming::getDatum, &DynamicProgramming::setDatum) - .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, &DynamicProgramming::setCostMatrix) - .def_property("num_bkps", &DynamicProgramming::get_num_bkps, &DynamicProgramming::set_num_bkps) - .def_property("num_timesteps", &DynamicProgramming::get_num_timesteps, &DynamicProgramming::set_num_timesteps) - .def_property("num_parameters", &DynamicProgramming::get_num_parameters, &DynamicProgramming::set_num_parameters) - .def(py::init<>()) - .def("read_input", &DynamicProgramming::read_input) - .def("initialize_cost_matrix", &DynamicProgramming::initialize_cost_matrix) - .def("return_breakpoints", &DynamicProgramming::return_breakpoints) - .def("set_threads", &DynamicProgramming::set_parallelization); -} diff --git a/Dynp/src/interface/dupininterface.py b/Dynp/src/interface/dupininterface.py deleted file mode 100644 index 58d9d1a..0000000 --- a/Dynp/src/interface/dupininterface.py +++ /dev/null @@ -1,114 +0,0 @@ -import numpy as np -import _dupin -import random -import dupin as du -import ruptures as rpt -import time -import os -import psutil -from dupin.detect.costs import CostLinearFit - - -random.seed(445) -def compute_python_cost_matrix(signal: np.ndarray, min_size: int = 3): - cost = CostLinearFit(metric = "l2") - cost.fit(signal) - n_samples = signal.shape[0] - pycost_matrix = np.zeros((n_samples, n_samples)) - for start in range(n_samples): - for end in range(start + min_size, n_samples): - pycost_matrix[start, end] = cost.error(start, end) - return pycost_matrix - - -def compute_cplus_cost_matrix(signal: np.ndarray): - - algo = _dupin.DynamicProgramming() - algo.num_bkps = 4; - algo.num_timesteps = signal.shape[0] - algo.num_parameters = signal.shape[1] - algo.datum = np.asfortranarray(signal) - start1 = time.time() - algo.initialize_cost_matrix() - end1 = time.time() - topcppbkps = algo.return_breakpoints() - end2 = time.time() - cpptime = end1-start1 - cpptime2 = end2-end1 - print("costtime: ", cpptime, "\n", "segtime: ", cpptime2) - return topcppbkps, cpptime - -def compute_cplus_cost_matrix2d(signal: np.ndarray): - - algo = _dupin.DupinAlgo() - algo.num_bkps = 4; - algo.num_timesteps = signal.shape[0] - algo.num_parameters = signal.shape[1] - algo.datum = np.asfortranarray(signal) - start1 = time.time() - algo.initialize_cost_matrix2d() - - topcppbkps = algo.getTopDownBreakpoints("old") - end1 = time.time() - cpptime = end1-start1 - print("cpptime2d: ", cpptime) - return topcppbkps, cpptime - - - - -def generate_1_feature_data(size): - - data = (np.repeat([0, 200, 400, 600, 800], size / 5) + np.random.random(size)).reshape((-1, 1)) - return data - -def compute_cpp_operations(data): - topcppbkps, cpptime = compute_cplus_cost_matrix(data) - return topcppbkps, cpptime - -def compute_cpp_operations2d(data): - topcppbkps, cpptime = compute_cplus_cost_matrix2d(data) - return topcppbkps, cpptime - - -def compute_python_operations(data): - start_time = time.time() - dynp = rpt.Dynp(custom_cost=du.detect.CostLinearFit("l2"), jump=1) - sweepsweep = du.detect.SweepDetector(dynp, 10) - sweepsweep.fit(data) - end_time = time.time() - pythontime = end_time - start_time - return sweepsweep.change_points_, sweepsweep.opt_change_points_, pythontime - - -p = psutil.Process(os.getpid()) -#p.cpu_affinity([0]) # Limiting to core 0 - -def test_dupin(): - - datum = generate_1_feature_data(5000) - # C++ Operations - total_time = 0 -# for i in range(1,11): -# cppcost_matrix, topcppbkps, cpptime = compute_cpp_operations(current_qdata) -# total_time = total_time + cpptime -# print ("Average time for:",current_data.shape, ": ", total_time/10) - - - topcppbkps, cpptime = compute_cpp_operations(datum) - -# topcppbkps2, cpptime2 = compute_cpp_operations2d(datum) - print("Top Down C++ breakpoints:", topcppbkps) - # print("Top Down C++ breakpoints2d:", topcppbkps2) - - # Python Operations -# python_bkps, opt_python_bkps, pythontime = compute_python_operations(datum) - - # Comparing and printing results -# multiplier = pythontime / cpptime - # print("C++ is", multiplier, "times faster!") - # print(f"Python breakpoints: {python_bkps} opt change points: {opt_python_bkps}") - - -if __name__ == "__main__": - test_dupin() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..bf0d409 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,17 @@ +pybind11_add_module(_dupin dupininterface.cpp + dupin.h dupin.cpp +) + +set_target_properties(_dupin PROPERTIES + CXX_STANDARD 17 + CMAKE_CXX_STANDARD_REQUIRED True +) + +target_include_directories(_dupin PRIVATE + ${EIGEN3_INCLUDE_DIR} + ${TBB_INCLUDE_DIRS} +) + +target_link_libraries(_dupin PRIVATE TBB::tbb) +target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION}) +target_compile_options(_dupin PRIVATE -O2 -march=native) diff --git a/src/dupin.cpp b/src/dupin.cpp new file mode 100644 index 0000000..0b14cc8 --- /dev/null +++ b/src/dupin.cpp @@ -0,0 +1,176 @@ +#include "dupin.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Eigen; + +DynamicProgramming::DynamicProgramming() + : num_bkps(0), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {} + +DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, + int num_timesteps_, int jump_, + int min_size_) + : num_bkps(num_bkps_), num_parameters(num_parameters_), + num_timesteps(num_timesteps_), jump(jump_), min_size(min_size_) {} + +void DynamicProgramming::scale_datum() { + VectorXd min_val = datum.colwise().minCoeff(); + VectorXd max_val = datum.colwise().maxCoeff(); + VectorXd range = max_val - min_val; + + for (int j = 0; j < num_parameters; ++j) { + if (range(j) == 0.0) { + datum.col(j).setZero(); + } else { + datum.col(j) = (datum.col(j).array() - min_val(j)) / range(j); + } + } +} +void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { + lfit.x = VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / + (num_timesteps - 1); + lfit.y = datum; +} + +VectorXd DynamicProgramming::regression_line(int start, int end, int dim, + linear_fit_struct &lfit) { + int n = end - start; + VectorXd x = lfit.x.segment(start, n); + VectorXd y = lfit.y.col(dim).segment(start, n); + + double x_mean = x.mean(); + double y_mean = y.mean(); + + VectorXd x_centered = x.array() - x_mean; + VectorXd y_centered = y.array() - y_mean; + + double slope = x_centered.dot(y_centered) / x_centered.squaredNorm(); + double intercept = y_mean - slope * x_mean; + + return x.unaryExpr( + [slope, intercept](double xi) { return slope * xi + intercept; }); +} + +double DynamicProgramming::l2_cost(MatrixXd &predicted_y, int start, int end) { + MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - + datum.block(start, 0, end - start, num_parameters); + return sqrt(diff.array().square().sum()); +} + +MatrixXd DynamicProgramming::predicted(int start, int end, + linear_fit_struct &lfit) { + MatrixXd predicted_y(num_timesteps, num_parameters); + for (int i = 0; i < num_parameters; ++i) { + predicted_y.block(start, i, end - start, 1) = + regression_line(start, end, i, lfit); + } + return predicted_y; +} + +double DynamicProgramming::cost_function(int start, int end) { + linear_fit_struct lfit; + regression_setup(lfit); + MatrixXd predicted_y = predicted(start, end, lfit); + return l2_cost(predicted_y, start, end); +} + +void DynamicProgramming::initialize_cost_matrix() { + scale_datum(); + cost_matrix.initialize(num_timesteps); + + tbb::parallel_for(tbb::blocked_range(0, num_timesteps), + [&](const tbb::blocked_range &r) { + for (int i = r.begin(); i < r.end(); ++i) { + for (int j = i + min_size; j < num_timesteps; ++j) { + cost_matrix(i, j) = cost_function(i, j); + } + } + }); +} + +pair> DynamicProgramming::seg(int start, int end, + int num_bkps) { + MemoKey key = {start, end, num_bkps}; + auto it = memo.find(key); + if (it != memo.end()) { + return it->second; + } + // no_memo++; + if (num_bkps == 0) { + return {cost_matrix(start, end), {end}}; + } + + pair> best = {numeric_limits::infinity(), {}}; + + for (int bkp = start + min_size; bkp < end; bkp++) { + if ((bkp - start) >= min_size && (end - bkp) >= min_size) { + auto left = seg(start, bkp, num_bkps - 1); + auto right = seg(bkp, end, 0); + double cost = left.first + right.first; + if (cost < best.first) { + best.first = cost; + best.second = left.second; + best.second.push_back(bkp); + best.second.insert(best.second.end(), right.second.begin(), + right.second.end()); + } + } + } + + memo[key] = best; + return best; +} + +vector DynamicProgramming::return_breakpoints() { + auto result = seg(0, num_timesteps - 1, num_bkps); + vector breakpoints = result.second; + sort(breakpoints.begin(), breakpoints.end()); + breakpoints.erase(unique(breakpoints.begin(), breakpoints.end()), + breakpoints.end()); + return breakpoints; +} + +void set_parallelization(int num_threads) { + static tbb::global_control gc(tbb::global_control::max_allowed_parallelism, + num_threads); +} + +int DynamicProgramming::get_num_timesteps() { return num_timesteps; } + +int DynamicProgramming::get_num_parameters() { return num_parameters; } + +int DynamicProgramming::get_num_bkps() { return num_bkps; } + +Eigen::MatrixXd &DynamicProgramming::getDatum() { return datum; } + +DynamicProgramming::upper_triangular_cost_matrix & +DynamicProgramming::getCostMatrix() { + return cost_matrix; +} + +void DynamicProgramming::set_num_timesteps(int value) { num_timesteps = value; } + +void DynamicProgramming::set_num_parameters(int value) { + num_parameters = value; +} + +void DynamicProgramming::set_num_bkps(int value) { num_bkps = value; } + +void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) { + datum = value; +} + +void DynamicProgramming::setCostMatrix( + const DynamicProgramming::upper_triangular_cost_matrix &value) { + cost_matrix = value; +} + +int main() { return 0; } diff --git a/src/dupin.h b/src/dupin.h new file mode 100644 index 0000000..f01022f --- /dev/null +++ b/src/dupin.h @@ -0,0 +1,129 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// DynamicProgramming class for dynamic programming based segmentation. +class DynamicProgramming { // change name to Dynamic Programming +private: + class upper_triangular_cost_matrix { + private: + std::vector matrix; + int size; + + int index(int i, int j) const { + return i * (2 * size - i + 1) / 2 + (j - i); + } + + public: + upper_triangular_cost_matrix() : size(0) {} + + void initialize(int n) { + size = n; + matrix.resize(n * (n + 1) / 2, 0.0); + } + + double &operator()(int i, int j) { return matrix[index(i, j)]; } + + int getSize() const { return size; } + }; + upper_triangular_cost_matrix cost_matrix; + // Struct for memoization key, combining start, end, and number of + // breakpoints. + struct MemoKey { + int start; + int end; + int num_bkps; + + // Comparison operator for MemoKey. + bool operator==(const MemoKey &other) const { + return start == other.start && end == other.end && + num_bkps == other.num_bkps; + } + }; + + // Custom XOR-bit hash function for MemoKey, avoids clustering of data in + // unordered map to improve efficiency. + struct MemoKeyHash { // test without hash function + std::size_t operator()(const MemoKey &key) const { + return ((std::hash()(key.start) ^ + (std::hash()(key.end) << 1)) >> + 1) ^ + std::hash()(key.num_bkps); + } + }; + + // Memoization map to store the cost and partition for given parameters. + std::unordered_map>, MemoKeyHash> + memo; + + int num_bkps; // Number of breakpoints to detect. + int num_parameters; // Number of features in the dataset. + int num_timesteps; // Number of data points (time steps). + int jump; // Interval for checking potential breakpoints. + int min_size; // Minimum size of a segment. + Eigen::MatrixXd datum; // Matrix storing the dataset. + + // Structure for storing linear regression parameters. + struct linear_fit_struct { + Eigen::MatrixXd y; // Dependent variable (labels). + Eigen::VectorXd x; // z Independent variable (time steps). + }; + +public: + // Default constructor. + DynamicProgramming(); + + // Parameterized constructor. + DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, + int jump_, int min_size_); + + // Scales the dataset using min-max normalization. + void scale_datum(); + + // Prepares data for linear regression. + void regression_setup(linear_fit_struct &lfit); + + // Calculates the regression line for a given data segment. + Eigen::VectorXd regression_line(int start, int end, int dim, + linear_fit_struct &lfit); + + // Generates predicted values based on the linear regression model. + Eigen::MatrixXd predicted(int start, int end, linear_fit_struct &lfit); + + // Calculates L2 cost (Euclidean distance) between predicted and actual data. + double l2_cost(Eigen::MatrixXd &predicted_y, int start, int end); + + // Computes the cost of a specific data segment using linear regression. + double cost_function(int start, int end); + + // Initializes and fills the cost matrix for all data segments. + void initialize_cost_matrix(); + + // Recursive function for dynamic programming segmentation. + std::pair> seg(int start, int end, int num_bkps); + + void set_parallelization(int num_threads); + // Returns the optimal set of breakpoints after segmentation. + std::vector return_breakpoints(); + + // Getter functions for accessing private class members. + int get_num_timesteps(); + int get_num_parameters(); + int get_num_bkps(); + Eigen::MatrixXd &getDatum(); + DynamicProgramming::upper_triangular_cost_matrix &getCostMatrix(); + + // Setter functions for modifying private class members. + void set_num_timesteps(int value); + void set_num_parameters(int value); + void set_num_bkps(int value); + void setDatum(const Eigen::MatrixXd &value); + void + setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix &value); +}; diff --git a/src/dupininterface.cpp b/src/dupininterface.cpp new file mode 100644 index 0000000..63fa2c2 --- /dev/null +++ b/src/dupininterface.cpp @@ -0,0 +1,25 @@ +#include "dupin.h" +#include +#include +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_dupin, m) { + py::class_(m, "DynamicProgramming") + .def(py::init<>()) + .def_property("datum", &DynamicProgramming::getDatum, + &DynamicProgramming::setDatum) + .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, + &DynamicProgramming::setCostMatrix) + .def_property("num_bkps", &DynamicProgramming::get_num_bkps, + &DynamicProgramming::set_num_bkps) + .def_property("num_timesteps", &DynamicProgramming::get_num_timesteps, + &DynamicProgramming::set_num_timesteps) + .def_property("num_parameters", &DynamicProgramming::get_num_parameters, + &DynamicProgramming::set_num_parameters) + .def("initialize_cost_matrix", + &DynamicProgramming::initialize_cost_matrix) + .def("return_breakpoints", &DynamicProgramming::return_breakpoints) + .def("set_threads", &DynamicProgramming::set_parallelization); +} From 0c649500cb83fe798fca0ca1b461fa0b20a3a841 Mon Sep 17 00:00:00 2001 From: npkamath Date: Thu, 11 Jan 2024 23:43:06 -0500 Subject: [PATCH 08/27] fixed namespace, naming, and cleaned up comments --- src/dupin.cpp | 59 +++++++++++++++++++++++++-------------------------- src/dupin.h | 22 +++++++++---------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/src/dupin.cpp b/src/dupin.cpp index 0b14cc8..71d136d 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -1,13 +1,13 @@ -#include "dupin.h" -#include -#include #include +#include #include +#include +#include +#include #include #include #include -#include -#include +#include "dupin.h" using namespace std; using namespace Eigen; @@ -22,9 +22,9 @@ DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, num_timesteps(num_timesteps_), jump(jump_), min_size(min_size_) {} void DynamicProgramming::scale_datum() { - VectorXd min_val = datum.colwise().minCoeff(); - VectorXd max_val = datum.colwise().maxCoeff(); - VectorXd range = max_val - min_val; + Eigen::VectorXd min_val = datum.colwise().minCoeff(); + Eigen::VectorXd max_val = datum.colwise().maxCoeff(); + Eigen::VectorXd range = max_val - min_val; for (int j = 0; j < num_parameters; ++j) { if (range(j) == 0.0) { @@ -35,22 +35,22 @@ void DynamicProgramming::scale_datum() { } } void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { - lfit.x = VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / + lfit.x = Eigen::VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / (num_timesteps - 1); lfit.y = datum; } -VectorXd DynamicProgramming::regression_line(int start, int end, int dim, +Eigen::VectorXd DynamicProgramming::regression_line(int start, int end, int dim, linear_fit_struct &lfit) { int n = end - start; - VectorXd x = lfit.x.segment(start, n); - VectorXd y = lfit.y.col(dim).segment(start, n); + Eigen::VectorXd x = lfit.x.segment(start, n); + Eigen::VectorXd y = lfit.y.col(dim).segment(start, n); double x_mean = x.mean(); double y_mean = y.mean(); - VectorXd x_centered = x.array() - x_mean; - VectorXd y_centered = y.array() - y_mean; + Eigen::VectorXd x_centered = x.array() - x_mean; + Eigen::VectorXd y_centered = y.array() - y_mean; double slope = x_centered.dot(y_centered) / x_centered.squaredNorm(); double intercept = y_mean - slope * x_mean; @@ -59,15 +59,15 @@ VectorXd DynamicProgramming::regression_line(int start, int end, int dim, [slope, intercept](double xi) { return slope * xi + intercept; }); } -double DynamicProgramming::l2_cost(MatrixXd &predicted_y, int start, int end) { - MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - +double DynamicProgramming::l2_cost(Eigen::MatrixXd &predicted_y, int start, int end) { + Eigen::MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - datum.block(start, 0, end - start, num_parameters); - return sqrt(diff.array().square().sum()); + return std::sqrt(diff.array().square().sum()); } -MatrixXd DynamicProgramming::predicted(int start, int end, +Eigen::MatrixXd DynamicProgramming::predicted(int start, int end, linear_fit_struct &lfit) { - MatrixXd predicted_y(num_timesteps, num_parameters); + Eigen::MatrixXd predicted_y(num_timesteps, num_parameters); for (int i = 0; i < num_parameters; ++i) { predicted_y.block(start, i, end - start, 1) = regression_line(start, end, i, lfit); @@ -78,7 +78,7 @@ MatrixXd DynamicProgramming::predicted(int start, int end, double DynamicProgramming::cost_function(int start, int end) { linear_fit_struct lfit; regression_setup(lfit); - MatrixXd predicted_y = predicted(start, end, lfit); + Eigen::MatrixXd predicted_y = predicted(start, end, lfit); return l2_cost(predicted_y, start, end); } @@ -96,19 +96,18 @@ void DynamicProgramming::initialize_cost_matrix() { }); } -pair> DynamicProgramming::seg(int start, int end, +std::pair> DynamicProgramming::seg(int start, int end, int num_bkps) { MemoKey key = {start, end, num_bkps}; auto it = memo.find(key); if (it != memo.end()) { return it->second; } - // no_memo++; if (num_bkps == 0) { return {cost_matrix(start, end), {end}}; } - pair> best = {numeric_limits::infinity(), {}}; + std::pair> best = {std::numeric_limits::infinity(), {}}; for (int bkp = start + min_size; bkp < end; bkp++) { if ((bkp - start) >= min_size && (end - bkp) >= min_size) { @@ -129,11 +128,11 @@ pair> DynamicProgramming::seg(int start, int end, return best; } -vector DynamicProgramming::return_breakpoints() { +std::vector DynamicProgramming::return_breakpoints() { auto result = seg(0, num_timesteps - 1, num_bkps); - vector breakpoints = result.second; - sort(breakpoints.begin(), breakpoints.end()); - breakpoints.erase(unique(breakpoints.begin(), breakpoints.end()), + std::vector breakpoints = result.second; + std::sort(breakpoints.begin(), breakpoints.end()); + breakpoints.erase(std::unique(breakpoints.begin(), breakpoints.end()), breakpoints.end()); return breakpoints; } @@ -151,7 +150,7 @@ int DynamicProgramming::get_num_bkps() { return num_bkps; } Eigen::MatrixXd &DynamicProgramming::getDatum() { return datum; } -DynamicProgramming::upper_triangular_cost_matrix & +DynamicProgramming::UpperTriangularMatrix & DynamicProgramming::getCostMatrix() { return cost_matrix; } @@ -169,8 +168,8 @@ void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) { } void DynamicProgramming::setCostMatrix( - const DynamicProgramming::upper_triangular_cost_matrix &value) { + const DynamicProgramming::UpperTriangularMatrix &value) { cost_matrix = value; } -int main() { return 0; } +int main() { return 0; } \ No newline at end of file diff --git a/src/dupin.h b/src/dupin.h index f01022f..9c52e35 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -1,38 +1,38 @@ #pragma once -#include #include #include #include -#include #include #include +#include + // DynamicProgramming class for dynamic programming based segmentation. class DynamicProgramming { // change name to Dynamic Programming private: - class upper_triangular_cost_matrix { + class UpperTriangularMatrix{ private: std::vector matrix; - int size; + int length; int index(int i, int j) const { - return i * (2 * size - i + 1) / 2 + (j - i); + return i * (2 * length - i + 1) / 2 + (j - i); } public: - upper_triangular_cost_matrix() : size(0) {} + UpperTriangularMatrix() : length(0) {} void initialize(int n) { - size = n; + length = n; matrix.resize(n * (n + 1) / 2, 0.0); } double &operator()(int i, int j) { return matrix[index(i, j)]; } - int getSize() const { return size; } + int getSize() const { return length; } }; - upper_triangular_cost_matrix cost_matrix; + UpperTriangularMatrix cost_matrix; // Struct for memoization key, combining start, end, and number of // breakpoints. struct MemoKey { @@ -117,7 +117,7 @@ class DynamicProgramming { // change name to Dynamic Programming int get_num_parameters(); int get_num_bkps(); Eigen::MatrixXd &getDatum(); - DynamicProgramming::upper_triangular_cost_matrix &getCostMatrix(); + DynamicProgramming::UpperTriangularMatrix &getCostMatrix(); // Setter functions for modifying private class members. void set_num_timesteps(int value); @@ -125,5 +125,5 @@ class DynamicProgramming { // change name to Dynamic Programming void set_num_bkps(int value); void setDatum(const Eigen::MatrixXd &value); void - setCostMatrix(const DynamicProgramming::upper_triangular_cost_matrix &value); + setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); }; From fe6bac1a027e24fb89fb58d493e242f2f6ed12ed Mon Sep 17 00:00:00 2001 From: npkamath Date: Sun, 14 Jan 2024 16:59:21 -0500 Subject: [PATCH 09/27] added dynp.py file, fixed constructors, added column_indices for faster indexing --- dupin/detect/dynp.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dupin/detect/dynp.py diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py new file mode 100644 index 0000000..e69de29 From 245a92fc72281afd44fd644430459fa4f75fb634 Mon Sep 17 00:00:00 2001 From: npkamath Date: Sun, 14 Jan 2024 19:09:49 -0500 Subject: [PATCH 10/27] fixed index system --- src/dupin.cpp | 38 +++++++++++++++++------------------ src/dupin.h | 45 ++++++++++++++++++++++++++++-------------- src/dupininterface.cpp | 2 +- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/src/dupin.cpp b/src/dupin.cpp index 71d136d..3c77262 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -13,31 +13,33 @@ using namespace std; using namespace Eigen; DynamicProgramming::DynamicProgramming() - : num_bkps(0), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {} - -DynamicProgramming::DynamicProgramming(int num_bkps_, int num_parameters_, - int num_timesteps_, int jump_, - int min_size_) - : num_bkps(num_bkps_), num_parameters(num_parameters_), - num_timesteps(num_timesteps_), jump(jump_), min_size(min_size_) {} + : num_bkps(1), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {} + +DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, + int jump_, int min_size_) + : data(data), num_bkps(num_bkps_), + jump(jump_), min_size(min_size_) { + num_timesteps = data.rows(); + num_parameters = data.cols(); +} -void DynamicProgramming::scale_datum() { - Eigen::VectorXd min_val = datum.colwise().minCoeff(); - Eigen::VectorXd max_val = datum.colwise().maxCoeff(); +void DynamicProgramming::scale_data() { + Eigen::VectorXd min_val = data.colwise().minCoeff(); + Eigen::VectorXd max_val = data.colwise().maxCoeff(); Eigen::VectorXd range = max_val - min_val; for (int j = 0; j < num_parameters; ++j) { if (range(j) == 0.0) { - datum.col(j).setZero(); + data.col(j).setZero(); } else { - datum.col(j) = (datum.col(j).array() - min_val(j)) / range(j); + data.col(j) = (data.col(j).array() - min_val(j)) / range(j); } } } void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { lfit.x = Eigen::VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) / (num_timesteps - 1); - lfit.y = datum; + lfit.y = data; } Eigen::VectorXd DynamicProgramming::regression_line(int start, int end, int dim, @@ -61,7 +63,7 @@ Eigen::VectorXd DynamicProgramming::regression_line(int start, int end, int dim, double DynamicProgramming::l2_cost(Eigen::MatrixXd &predicted_y, int start, int end) { Eigen::MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) - - datum.block(start, 0, end - start, num_parameters); + data.block(start, 0, end - start, num_parameters); return std::sqrt(diff.array().square().sum()); } @@ -83,7 +85,7 @@ double DynamicProgramming::cost_function(int start, int end) { } void DynamicProgramming::initialize_cost_matrix() { - scale_datum(); + scale_data(); cost_matrix.initialize(num_timesteps); tbb::parallel_for(tbb::blocked_range(0, num_timesteps), @@ -148,7 +150,7 @@ int DynamicProgramming::get_num_parameters() { return num_parameters; } int DynamicProgramming::get_num_bkps() { return num_bkps; } -Eigen::MatrixXd &DynamicProgramming::getDatum() { return datum; } +Eigen::MatrixXd &DynamicProgramming::getDatum() { return data; } DynamicProgramming::UpperTriangularMatrix & DynamicProgramming::getCostMatrix() { @@ -161,10 +163,8 @@ void DynamicProgramming::set_num_parameters(int value) { num_parameters = value; } -void DynamicProgramming::set_num_bkps(int value) { num_bkps = value; } - void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) { - datum = value; + data = value; } void DynamicProgramming::setCostMatrix( diff --git a/src/dupin.h b/src/dupin.h index 9c52e35..412fe5f 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -9,30 +9,44 @@ // DynamicProgramming class for dynamic programming based segmentation. -class DynamicProgramming { // change name to Dynamic Programming +class DynamicProgramming { private: - class UpperTriangularMatrix{ + class UpperTriangularMatrix { private: std::vector matrix; + std::vector row_indices; int length; - int index(int i, int j) const { - return i * (2 * length - i + 1) / 2 + (j - i); + int index(int row, int col) const { + return row_indices[row] + col - row; } public: UpperTriangularMatrix() : length(0) {} - void initialize(int n) { - length = n; - matrix.resize(n * (n + 1) / 2, 0.0); + UpperTriangularMatrix(int n) : length(n), matrix(n * (n + 1) / 2, 0.0), + row_indices(n) { + for (int row = 0; row < n; ++row) { + row_indices[row] = row * (2 * length - row + 1) / 2; + } } - double &operator()(int i, int j) { return matrix[index(i, j)]; } + void initialize(int n) { + length = n; + matrix.resize(n * (n + 1) / 2, 0.0); + row_indices.resize(n); + for (int row = 0; row < n; ++row) { + row_indices[row] = row * (2 * length - row + 1) / 2; + } + } + double &operator()(int row, int col) { + return matrix[index(row, col)]; + } int getSize() const { return length; } - }; +}; UpperTriangularMatrix cost_matrix; + // Struct for memoization key, combining start, end, and number of // breakpoints. struct MemoKey { @@ -49,7 +63,7 @@ class DynamicProgramming { // change name to Dynamic Programming // Custom XOR-bit hash function for MemoKey, avoids clustering of data in // unordered map to improve efficiency. - struct MemoKeyHash { // test without hash function + struct MemoKeyHash { std::size_t operator()(const MemoKey &key) const { return ((std::hash()(key.start) ^ (std::hash()(key.end) << 1)) >> @@ -67,7 +81,7 @@ class DynamicProgramming { // change name to Dynamic Programming int num_timesteps; // Number of data points (time steps). int jump; // Interval for checking potential breakpoints. int min_size; // Minimum size of a segment. - Eigen::MatrixXd datum; // Matrix storing the dataset. + Eigen::MatrixXd data; // Matrix storing the dataset. // Structure for storing linear regression parameters. struct linear_fit_struct { @@ -80,11 +94,11 @@ class DynamicProgramming { // change name to Dynamic Programming DynamicProgramming(); // Parameterized constructor. - DynamicProgramming(int num_bkps_, int num_parameters_, int num_timesteps_, - int jump_, int min_size_); + DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, + int min_size_); // Scales the dataset using min-max normalization. - void scale_datum(); + void scale_data(); // Prepares data for linear regression. void regression_setup(linear_fit_struct &lfit); @@ -108,7 +122,9 @@ class DynamicProgramming { // change name to Dynamic Programming // Recursive function for dynamic programming segmentation. std::pair> seg(int start, int end, int num_bkps); + //sets number of threads for parallelization void set_parallelization(int num_threads); + // Returns the optimal set of breakpoints after segmentation. std::vector return_breakpoints(); @@ -122,7 +138,6 @@ class DynamicProgramming { // change name to Dynamic Programming // Setter functions for modifying private class members. void set_num_timesteps(int value); void set_num_parameters(int value); - void set_num_bkps(int value); void setDatum(const Eigen::MatrixXd &value); void setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); diff --git a/src/dupininterface.cpp b/src/dupininterface.cpp index 63fa2c2..aa6a607 100644 --- a/src/dupininterface.cpp +++ b/src/dupininterface.cpp @@ -8,7 +8,7 @@ namespace py = pybind11; PYBIND11_MODULE(_dupin, m) { py::class_(m, "DynamicProgramming") .def(py::init<>()) - .def_property("datum", &DynamicProgramming::getDatum, + .def_property("data", &DynamicProgramming::getDatum, &DynamicProgramming::setDatum) .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, &DynamicProgramming::setCostMatrix) From 0af1b08cf67832e5c4af40a8285373d37d431cf9 Mon Sep 17 00:00:00 2001 From: npkamath Date: Mon, 15 Jan 2024 17:56:28 -0500 Subject: [PATCH 11/27] reorganized class variables and added dynp.py functions and fit --- CMakeLists.txt | 2 +- dupin/detect/dynp.py | 46 ++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 2 +- src/dupin.cpp | 14 ++++++------- src/dupin.h | 36 +++++++++++++++++---------------- src/dupininterface.cpp | 8 ++++---- 6 files changed, 77 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 570a21b..810bbb7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ endif() find_package(Eigen3 REQUIRED) find_package(TBB REQUIRED) -find_package(pybind11 CONFIG REQUIRED) +add_subdirectory(extern/pybind11) include_directories(${PROJECT_SOURCE_DIR}/src) add_subdirectory(src) diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py index e69de29..0fd38c2 100644 --- a/dupin/detect/dynp.py +++ b/dupin/detect/dynp.py @@ -0,0 +1,46 @@ +import _DynP + +class DynP: + """Dynamic Programming class for calculating optimal segmentation + + Attributes: + data (np.ndarray): Matrix storing the dataset. + num_bkps (int): Number of breakpoints to detect. + jump (int): Interval for checking potential breakpoints. + min_size (int): Minimum size of a segment. + """ + + def __init__(self, data: np.ndarray, num_bkps: int, jump: int, min_size: int): + """Initializes the DynamicProgramming instance with given parameters.""" + self.dynp = _DynP.DynamicProgramming(data, num_bkps, jump, min_size) + + def set_num_threads(self, num_threads: int): + """Sets the number of threads for parallelization. + + Args: + num_threads (int): The number of threads to use. + """ + self.dynp.set_threads(num_threads) + + def return_breakpoints(self) -> list: + """Returns the optimal set of breakpoints after segmentation. + + Returns: + list: A list of integers representing the breakpoints. + """ + return self.dynp.return_breakpoints() + + def initialize_cost_matrix(self): + """Initializes and fills the upper triangular cost matrix for all data segments.""" + self.dynp.initialize_cost_matrix() + + def fit(self) -> list: + """Calculates the cost matrix and returns the breakpoints. + + Returns: + list: A list of integers representing the breakpoints. + """ + return self.dynp.fit() + + + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bf0d409..6fc4fa5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ pybind11_add_module(_dupin dupininterface.cpp - dupin.h dupin.cpp + dupin.h dupin.cpp ) set_target_properties(_dupin PROPERTIES diff --git a/src/dupin.cpp b/src/dupin.cpp index 3c77262..814cc27 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -87,7 +87,6 @@ double DynamicProgramming::cost_function(int start, int end) { void DynamicProgramming::initialize_cost_matrix() { scale_data(); cost_matrix.initialize(num_timesteps); - tbb::parallel_for(tbb::blocked_range(0, num_timesteps), [&](const tbb::blocked_range &r) { for (int i = r.begin(); i < r.end(); ++i) { @@ -130,7 +129,7 @@ std::pair> DynamicProgramming::seg(int start, int end, return best; } -std::vector DynamicProgramming::return_breakpoints() { +std::vector DynamicProgramming::compute_breakpoints() { auto result = seg(0, num_timesteps - 1, num_bkps); std::vector breakpoints = result.second; std::sort(breakpoints.begin(), breakpoints.end()); @@ -139,6 +138,11 @@ std::vector DynamicProgramming::return_breakpoints() { return breakpoints; } +std::vector DynamicProgramming::fit(){ + initialize_cost_matrix(); + return compute_breakpoints(); +} + void set_parallelization(int num_threads) { static tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); @@ -157,12 +161,6 @@ DynamicProgramming::getCostMatrix() { return cost_matrix; } -void DynamicProgramming::set_num_timesteps(int value) { num_timesteps = value; } - -void DynamicProgramming::set_num_parameters(int value) { - num_parameters = value; -} - void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) { data = value; } diff --git a/src/dupin.h b/src/dupin.h index 412fe5f..c48a32d 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -88,16 +88,7 @@ class DynamicProgramming { Eigen::MatrixXd y; // Dependent variable (labels). Eigen::VectorXd x; // z Independent variable (time steps). }; - -public: - // Default constructor. - DynamicProgramming(); - - // Parameterized constructor. - DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, - int min_size_); - - // Scales the dataset using min-max normalization. + // Scales the dataset using min-max normalization. void scale_data(); // Prepares data for linear regression. @@ -116,17 +107,30 @@ class DynamicProgramming { // Computes the cost of a specific data segment using linear regression. double cost_function(int start, int end); + // Recursive function for dynamic programming segmentation. + std::pair> seg(int start, int end, int num_bkps); + + +public: + // Default constructor. + DynamicProgramming(); + + // Parameterized constructor. + DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, + int min_size_); + // Initializes and fills the cost matrix for all data segments. void initialize_cost_matrix(); - // Recursive function for dynamic programming segmentation. - std::pair> seg(int start, int end, int num_bkps); //sets number of threads for parallelization void set_parallelization(int num_threads); // Returns the optimal set of breakpoints after segmentation. - std::vector return_breakpoints(); + std::vector compute_breakpoints(); + + // Calculates the cost matrix and return the breakpoints + std::vector fit(); // Getter functions for accessing private class members. int get_num_timesteps(); @@ -136,9 +140,7 @@ class DynamicProgramming { DynamicProgramming::UpperTriangularMatrix &getCostMatrix(); // Setter functions for modifying private class members. - void set_num_timesteps(int value); - void set_num_parameters(int value); + void setDatum(const Eigen::MatrixXd &value); - void - setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); + void setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); }; diff --git a/src/dupininterface.cpp b/src/dupininterface.cpp index aa6a607..2752228 100644 --- a/src/dupininterface.cpp +++ b/src/dupininterface.cpp @@ -5,21 +5,21 @@ namespace py = pybind11; -PYBIND11_MODULE(_dupin, m) { +PYBIND11_MODULE(_DynP, m) { py::class_(m, "DynamicProgramming") .def(py::init<>()) .def_property("data", &DynamicProgramming::getDatum, &DynamicProgramming::setDatum) .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, &DynamicProgramming::setCostMatrix) - .def_property("num_bkps", &DynamicProgramming::get_num_bkps, - &DynamicProgramming::set_num_bkps) + .def("num_bkps", &DynamicProgramming::get_num_bkps) .def_property("num_timesteps", &DynamicProgramming::get_num_timesteps, &DynamicProgramming::set_num_timesteps) .def_property("num_parameters", &DynamicProgramming::get_num_parameters, &DynamicProgramming::set_num_parameters) .def("initialize_cost_matrix", &DynamicProgramming::initialize_cost_matrix) - .def("return_breakpoints", &DynamicProgramming::return_breakpoints) + .def("return_breakpoints", &DynamicProgramming::compute_breakpoints) + .def("fit", &DynamicProgramming::fit) .def("set_threads", &DynamicProgramming::set_parallelization); } From 66e210427b5f12b37371783e8216a710d8e96d46 Mon Sep 17 00:00:00 2001 From: npkamath Date: Fri, 19 Jan 2024 02:12:40 -0500 Subject: [PATCH 12/27] fit function added with parameter input, removed whitespace with precommit, rearranged class to private --- CMakeLists.txt | 2 +- dupin/detect/dynp.py | 64 +++++++++++++++++++++++------------------- src/CMakeLists.txt | 2 +- src/dupin.cpp | 26 ++++++----------- src/dupin.h | 51 ++++++++++++--------------------- src/dupininterface.cpp | 10 ------- 6 files changed, 64 insertions(+), 91 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 810bbb7..570a21b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ endif() find_package(Eigen3 REQUIRED) find_package(TBB REQUIRED) -add_subdirectory(extern/pybind11) +find_package(pybind11 CONFIG REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/src) add_subdirectory(src) diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py index 0fd38c2..bf26407 100644 --- a/dupin/detect/dynp.py +++ b/dupin/detect/dynp.py @@ -1,46 +1,52 @@ +"""Implements dynamic programming class for optimal segementation algorithm.""" import _DynP +import numpy as np -class DynP: - """Dynamic Programming class for calculating optimal segmentation - Attributes: - data (np.ndarray): Matrix storing the dataset. - num_bkps (int): Number of breakpoints to detect. - jump (int): Interval for checking potential breakpoints. - min_size (int): Minimum size of a segment. +class DynP: + """Detects the change points in a time series. + + Attributes + ---------- + data: np.ndarray + Matrix storing the time series data. + num_bkps: int + Number of change points to detect. + jump: int + Interval for checking potential change points. Changing will + not provide optimal detection, but will reduce runtime. + min_size: int + Minimum size of a segment. Changing will not provide optimal + detection, but will reduce runtime. """ - def __init__(self, data: np.ndarray, num_bkps: int, jump: int, min_size: int): - """Initializes the DynamicProgramming instance with given parameters.""" + def __init__( + self, data: np.ndarray, num_bkps: int, jump: int, min_size: int + ): + """Initialize the DynamicProgramming instance with given parameters.""" self.dynp = _DynP.DynamicProgramming(data, num_bkps, jump, min_size) def set_num_threads(self, num_threads: int): - """Sets the number of threads for parallelization. + """Set the number of threads for parallelization. - Args: - num_threads (int): The number of threads to use. + Parameters + ---------- + num_threads: int + The number of threads to use during computation. Default + is determined automatically. """ self.dynp.set_threads(num_threads) - def return_breakpoints(self) -> list: - """Returns the optimal set of breakpoints after segmentation. - - Returns: - list: A list of integers representing the breakpoints. - """ - return self.dynp.return_breakpoints() - - def initialize_cost_matrix(self): - """Initializes and fills the upper triangular cost matrix for all data segments.""" - self.dynp.initialize_cost_matrix() + def fit(self, num_bkps: int) -> list: + """Calculate the cost matrix and return the breakpoints. - def fit(self) -> list: - """Calculates the cost matrix and returns the breakpoints. + Parameters + ---------- + num_bkps: int + number of change points to detect. - Returns: + Returns + ------- list: A list of integers representing the breakpoints. """ return self.dynp.fit() - - - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6fc4fa5..bf0d409 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ pybind11_add_module(_dupin dupininterface.cpp - dupin.h dupin.cpp + dupin.h dupin.cpp ) set_target_properties(_dupin PROPERTIES diff --git a/src/dupin.cpp b/src/dupin.cpp index 814cc27..6a53f6d 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -15,9 +15,9 @@ using namespace Eigen; DynamicProgramming::DynamicProgramming() : num_bkps(1), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {} -DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, +DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, int min_size_) - : data(data), num_bkps(num_bkps_), + : data(data), num_bkps(num_bkps_), jump(jump_), min_size(min_size_) { num_timesteps = data.rows(); num_parameters = data.cols(); @@ -95,6 +95,7 @@ void DynamicProgramming::initialize_cost_matrix() { } } }); + cost_computed = true; } std::pair> DynamicProgramming::seg(int start, int end, @@ -138,9 +139,12 @@ std::vector DynamicProgramming::compute_breakpoints() { return breakpoints; } -std::vector DynamicProgramming::fit(){ +std::vector DynamicProgramming::fit(int num_bkps_in){ + num_bkps = num_bkps_in; + if (!cost_computed){ initialize_cost_matrix(); - return compute_breakpoints(); + } + return compute_breakpoints(); } void set_parallelization(int num_threads) { @@ -148,26 +152,14 @@ void set_parallelization(int num_threads) { num_threads); } -int DynamicProgramming::get_num_timesteps() { return num_timesteps; } - -int DynamicProgramming::get_num_parameters() { return num_parameters; } - -int DynamicProgramming::get_num_bkps() { return num_bkps; } - -Eigen::MatrixXd &DynamicProgramming::getDatum() { return data; } - DynamicProgramming::UpperTriangularMatrix & DynamicProgramming::getCostMatrix() { return cost_matrix; } -void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) { - data = value; -} - void DynamicProgramming::setCostMatrix( const DynamicProgramming::UpperTriangularMatrix &value) { cost_matrix = value; } -int main() { return 0; } \ No newline at end of file +int main() { return 0; } diff --git a/src/dupin.h b/src/dupin.h index c48a32d..515049c 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -8,9 +8,11 @@ #include -// DynamicProgramming class for dynamic programming based segmentation. -class DynamicProgramming { +// Calculates optimal breakpoints in time-series data using memoization +class DynamicProgramming { private: + + //stores upper triangular cost matrix efficiently class UpperTriangularMatrix { private: std::vector matrix; @@ -24,13 +26,6 @@ class DynamicProgramming { public: UpperTriangularMatrix() : length(0) {} - UpperTriangularMatrix(int n) : length(n), matrix(n * (n + 1) / 2, 0.0), - row_indices(n) { - for (int row = 0; row < n; ++row) { - row_indices[row] = row * (2 * length - row + 1) / 2; - } - } - void initialize(int n) { length = n; matrix.resize(n * (n + 1) / 2, 0.0); @@ -45,8 +40,6 @@ class DynamicProgramming { } int getSize() const { return length; } }; - UpperTriangularMatrix cost_matrix; - // Struct for memoization key, combining start, end, and number of // breakpoints. struct MemoKey { @@ -63,7 +56,7 @@ class DynamicProgramming { // Custom XOR-bit hash function for MemoKey, avoids clustering of data in // unordered map to improve efficiency. - struct MemoKeyHash { + struct MemoKeyHash { std::size_t operator()(const MemoKey &key) const { return ((std::hash()(key.start) ^ (std::hash()(key.end) << 1)) >> @@ -81,8 +74,10 @@ class DynamicProgramming { int num_timesteps; // Number of data points (time steps). int jump; // Interval for checking potential breakpoints. int min_size; // Minimum size of a segment. - Eigen::MatrixXd data; // Matrix storing the dataset. + Eigen::MatrixXd data; // Matrix storing the dataset. + UpperTriangularMatrix cost_matrix; //Matrix storing costs + bool cost_computed = false; // Structure for storing linear regression parameters. struct linear_fit_struct { Eigen::MatrixXd y; // Dependent variable (labels). @@ -110,37 +105,27 @@ class DynamicProgramming { // Recursive function for dynamic programming segmentation. std::pair> seg(int start, int end, int num_bkps); +// Initializes and fills the cost matrix for all data segments. + void initialize_cost_matrix(); + + // Returns the optimal set of breakpoints after segmentation. + std::vector compute_breakpoints(); public: // Default constructor. DynamicProgramming(); // Parameterized constructor. - DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, + DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, int min_size_); - // Initializes and fills the cost matrix for all data segments. - void initialize_cost_matrix(); - - - //sets number of threads for parallelization + //Sets number of threads for parallelization void set_parallelization(int num_threads); - // Returns the optimal set of breakpoints after segmentation. - std::vector compute_breakpoints(); + // Calculates optimal breakpoints with given number of points. + std::vector fit(int num_bkps_in); - // Calculates the cost matrix and return the breakpoints - std::vector fit(); - - // Getter functions for accessing private class members. - int get_num_timesteps(); - int get_num_parameters(); - int get_num_bkps(); - Eigen::MatrixXd &getDatum(); + // Getter functions for cost matrix. DynamicProgramming::UpperTriangularMatrix &getCostMatrix(); - - // Setter functions for modifying private class members. - - void setDatum(const Eigen::MatrixXd &value); void setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); }; diff --git a/src/dupininterface.cpp b/src/dupininterface.cpp index 2752228..c4e757a 100644 --- a/src/dupininterface.cpp +++ b/src/dupininterface.cpp @@ -8,18 +8,8 @@ namespace py = pybind11; PYBIND11_MODULE(_DynP, m) { py::class_(m, "DynamicProgramming") .def(py::init<>()) - .def_property("data", &DynamicProgramming::getDatum, - &DynamicProgramming::setDatum) .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, &DynamicProgramming::setCostMatrix) - .def("num_bkps", &DynamicProgramming::get_num_bkps) - .def_property("num_timesteps", &DynamicProgramming::get_num_timesteps, - &DynamicProgramming::set_num_timesteps) - .def_property("num_parameters", &DynamicProgramming::get_num_parameters, - &DynamicProgramming::set_num_parameters) - .def("initialize_cost_matrix", - &DynamicProgramming::initialize_cost_matrix) - .def("return_breakpoints", &DynamicProgramming::compute_breakpoints) .def("fit", &DynamicProgramming::fit) .def("set_threads", &DynamicProgramming::set_parallelization); } From 9ccf1fe06b2e12d1f4ddda9470d6a9d382467e8b Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 11:16:00 -0500 Subject: [PATCH 13/27] build: Switch to scikit-build-core. --- CMakeLists.txt | 6 ++++-- dupin/CMakeLists.txt | 2 ++ pyproject.toml | 4 ++-- src/CMakeLists.txt | 8 +++++--- 4 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 dupin/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 570a21b..c3227aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.8) -project(_dupin VERSION 0.0.1) +project(dupin VERSION 0.0.1 LANGUAGES CXX) set(DEFAULT_BUILD_TYPE "Release") if(NOT CMAKE_BUILD_TYPE) @@ -8,7 +8,9 @@ endif() find_package(Eigen3 REQUIRED) find_package(TBB REQUIRED) +# Use modern method for Python binding +set(PYBIND11_NEWPYTHON ON) find_package(pybind11 CONFIG REQUIRED) -include_directories(${PROJECT_SOURCE_DIR}/src) add_subdirectory(src) +add_subdirectory(dupin) diff --git a/dupin/CMakeLists.txt b/dupin/CMakeLists.txt new file mode 100644 index 0000000..37fcf95 --- /dev/null +++ b/dupin/CMakeLists.txt @@ -0,0 +1,2 @@ + # Defaults to site-packages so only need package name + install(DIRECTORY ./ DESTINATION dupin FILES_MATCHING PATTERN "*.py") diff --git a/pyproject.toml b/pyproject.toml index f701a67..d1433f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools >= 64.0.0"] +requires = ["scikit-build-core>=0.7.0", "pybind11"] +build-backend = "scikit_build_core.build" [project] name = "dupin" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bf0d409..cc57a68 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ -pybind11_add_module(_dupin dupininterface.cpp - dupin.h dupin.cpp -) +list(APPEND dupin_cxx "dupininterface.cpp" "dupin.h" "dupin.cpp") + +pybind11_add_module(_dupin ${dupin_cxx}) set_target_properties(_dupin PROPERTIES CXX_STANDARD 17 @@ -15,3 +15,5 @@ target_include_directories(_dupin PRIVATE target_link_libraries(_dupin PRIVATE TBB::tbb) target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION}) target_compile_options(_dupin PRIVATE -O2 -march=native) +# Installs C++ extension into the root of the Python package +install(TARGETS _dupin LIBRARY DESTINATION dupin) From 8da0fe076274e91532f1b74eb8ddeaadb9a4c41c Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 11:28:53 -0500 Subject: [PATCH 14/27] ci: Add reusable workflow to install system packages Installs: - TBB - Eigen3 - Ninja --- .../install-nopython-dependencies.yml | 21 +++++++++++++++++++ .github/workflows/publish-packages.yml | 3 ++- .github/workflows/run-pytest.yml | 4 +++- 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/install-nopython-dependencies.yml diff --git a/.github/workflows/install-nopython-dependencies.yml b/.github/workflows/install-nopython-dependencies.yml new file mode 100644 index 0000000..de47594 --- /dev/null +++ b/.github/workflows/install-nopython-dependencies.yml @@ -0,0 +1,21 @@ +name: Install Non-Python Dependencies + +on: + workflow_call: + +jobs: + install-dependencies-linux: + runs-on: ubuntu-latest + steps: + - name: Install dependencies + run: sudo apt-get install tbb tbblib eigen3 ninja + install-dependencies-macos: + runs-on: macos-latest + steps: + - name: Install dependencies + run: brew install tbb eigen3 ninja + install-dependencies-windows: + runs-on: ubuntu-latest + steps: + - name: Install dependencies + run: choco install tbb eigen3 ninja diff --git a/.github/workflows/publish-packages.yml b/.github/workflows/publish-packages.yml index 71ecc96..645c4c8 100644 --- a/.github/workflows/publish-packages.yml +++ b/.github/workflows/publish-packages.yml @@ -34,10 +34,11 @@ jobs: -r requirements/requirements-test.txt \ -r requirements/requirements-jit.txt \ -r requirements/requirements-data.txt - - name: Install pypa/build run: python -m pip install build + - name: Install System Packages + with: ./.github/workflows/install-nopython-dependencies.yml - name: Build a binary wheel and a source tarball run: python -m build --outdir dist/ . diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index d7526b1..21cfdd0 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -45,7 +45,7 @@ jobs: python-version: ${{ matrix.python }} - name: Update pip/build packages run: | - pip install setuptools --upgrade + pip install pip --upgrade - name: Install newest dependencies run: | pip install -r requirements/requirements-test.txt @@ -60,6 +60,8 @@ jobs: run: | pip install -r requirements/requirements-jit.txt if: ${{ matrix.python != '3.12' }} + - name: Install system packages + uses: ./.github/workflows/install-nopython-dependencies.yml - name: Install the package run: | pip install -e . From 943a6c02f563447449d049772f8e53d98df77e71 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 11:53:01 -0500 Subject: [PATCH 15/27] ci: Fix package installation by using composite action Cannot use a reusable workflow as a step only as a job --- .../install-nopython-dependencies/action.yml | 14 +++++++++++++ .../install-nopython-dependencies.yml | 21 ------------------- .github/workflows/publish-packages.yml | 2 +- .github/workflows/run-pytest.yml | 2 +- 4 files changed, 16 insertions(+), 23 deletions(-) create mode 100644 .github/actions/install-nopython-dependencies/action.yml delete mode 100644 .github/workflows/install-nopython-dependencies.yml diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml new file mode 100644 index 0000000..afb5936 --- /dev/null +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -0,0 +1,14 @@ +name: Install Non-Python Dependencies +description: Installs system packages needed to build dupin. +runs: + using: "composite" + steps: + - install-dependencies-linux: + if: runner.os == 'Linux' + run: sudo apt-get install tbb tbblib eigen3 ninja + - install-dependencies-macos: + if: runner.os == 'macOS' + run: brew install tbb eigen3 ninja + - install-dependencies-windows: + if: runner.os == 'Windows' + run: choco install tbb eigen3 ninja diff --git a/.github/workflows/install-nopython-dependencies.yml b/.github/workflows/install-nopython-dependencies.yml deleted file mode 100644 index de47594..0000000 --- a/.github/workflows/install-nopython-dependencies.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Install Non-Python Dependencies - -on: - workflow_call: - -jobs: - install-dependencies-linux: - runs-on: ubuntu-latest - steps: - - name: Install dependencies - run: sudo apt-get install tbb tbblib eigen3 ninja - install-dependencies-macos: - runs-on: macos-latest - steps: - - name: Install dependencies - run: brew install tbb eigen3 ninja - install-dependencies-windows: - runs-on: ubuntu-latest - steps: - - name: Install dependencies - run: choco install tbb eigen3 ninja diff --git a/.github/workflows/publish-packages.yml b/.github/workflows/publish-packages.yml index 645c4c8..bf86484 100644 --- a/.github/workflows/publish-packages.yml +++ b/.github/workflows/publish-packages.yml @@ -38,7 +38,7 @@ jobs: run: python -m pip install build - name: Install System Packages - with: ./.github/workflows/install-nopython-dependencies.yml + uses: ./.github/actions/install-nopython-dependencies - name: Build a binary wheel and a source tarball run: python -m build --outdir dist/ . diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 21cfdd0..4b2fa04 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -61,7 +61,7 @@ jobs: pip install -r requirements/requirements-jit.txt if: ${{ matrix.python != '3.12' }} - name: Install system packages - uses: ./.github/workflows/install-nopython-dependencies.yml + uses: ./.github/actions/install-nopython-dependencies - name: Install the package run: | pip install -e . From 98ee45c3ae74833f16f1e8b1854fdbcb36a26026 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 11:59:16 -0500 Subject: [PATCH 16/27] ci: Correctly specify shell for custom action --- .github/actions/install-nopython-dependencies/action.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index afb5936..be959e4 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -5,10 +5,13 @@ runs: steps: - install-dependencies-linux: if: runner.os == 'Linux' + shell: bash run: sudo apt-get install tbb tbblib eigen3 ninja - install-dependencies-macos: if: runner.os == 'macOS' + shell: bash run: brew install tbb eigen3 ninja - install-dependencies-windows: if: runner.os == 'Windows' + shell: bash run: choco install tbb eigen3 ninja From 744b9284a5f6810e642f6a6c887d85be3f4cecfd Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 12:02:24 -0500 Subject: [PATCH 17/27] ci: Fix action step name formatting --- .github/actions/install-nopython-dependencies/action.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index be959e4..6879eb0 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -3,15 +3,15 @@ description: Installs system packages needed to build dupin. runs: using: "composite" steps: - - install-dependencies-linux: + - name: install-dependencies-linux: if: runner.os == 'Linux' shell: bash run: sudo apt-get install tbb tbblib eigen3 ninja - - install-dependencies-macos: + - name: install-dependencies-macos: if: runner.os == 'macOS' shell: bash run: brew install tbb eigen3 ninja - - install-dependencies-windows: + - name: install-dependencies-windows: if: runner.os == 'Windows' shell: bash run: choco install tbb eigen3 ninja From b8dd0f253a4f6a17613f2963af0c86025ea45191 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 13:04:07 -0500 Subject: [PATCH 18/27] ci: Remove trailing ":" --- .github/actions/install-nopython-dependencies/action.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index 6879eb0..0b34644 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -3,15 +3,15 @@ description: Installs system packages needed to build dupin. runs: using: "composite" steps: - - name: install-dependencies-linux: + - name: install-dependencies-linux if: runner.os == 'Linux' shell: bash run: sudo apt-get install tbb tbblib eigen3 ninja - - name: install-dependencies-macos: + - name: install-dependencies-macos if: runner.os == 'macOS' shell: bash run: brew install tbb eigen3 ninja - - name: install-dependencies-windows: + - name: install-dependencies-windows if: runner.os == 'Windows' shell: bash run: choco install tbb eigen3 ninja From c7e69208a70c16e265c123a17ff81c9958ac1e63 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 13:10:29 -0500 Subject: [PATCH 19/27] ci: Update package manager caches before installing --- .github/actions/install-nopython-dependencies/action.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index 0b34644..378506b 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -6,11 +6,15 @@ runs: - name: install-dependencies-linux if: runner.os == 'Linux' shell: bash - run: sudo apt-get install tbb tbblib eigen3 ninja + run: | + sudo apt-get update + sudo apt-get install tbb tbblib eigen3 ninja - name: install-dependencies-macos if: runner.os == 'macOS' shell: bash - run: brew install tbb eigen3 ninja + run: | + brew update + brew install tbb eigen3 ninja - name: install-dependencies-windows if: runner.os == 'Windows' shell: bash From 7771157c87f1ca8abfcfeb09e06a1114768e648f Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 13:24:18 -0500 Subject: [PATCH 20/27] ci: Fix apt-get package names --- .github/actions/install-nopython-dependencies/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index 378506b..84625a5 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | sudo apt-get update - sudo apt-get install tbb tbblib eigen3 ninja + sudo apt-get install libtbb libtbb-dev libeigen3-dev ninja-build - name: install-dependencies-macos if: runner.os == 'macOS' shell: bash From f7287f8c32998596ddff56cbca776e9a6e1e9ea1 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 13:37:00 -0500 Subject: [PATCH 21/27] ci: Fix one last package name --- .github/actions/install-nopython-dependencies/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index 84625a5..9cb9ada 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -8,7 +8,7 @@ runs: shell: bash run: | sudo apt-get update - sudo apt-get install libtbb libtbb-dev libeigen3-dev ninja-build + sudo apt-get install libtbb12 libtbb-dev libeigen3-dev ninja-build - name: install-dependencies-macos if: runner.os == 'macOS' shell: bash From 47aedc0fd9ef70bb1d25a16c2a2b35a4b9436795 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 24 Jan 2024 13:40:42 -0500 Subject: [PATCH 22/27] ci: Yet another package name change --- .github/actions/install-nopython-dependencies/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml index 9cb9ada..67dbd04 100644 --- a/.github/actions/install-nopython-dependencies/action.yml +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -14,7 +14,7 @@ runs: shell: bash run: | brew update - brew install tbb eigen3 ninja + brew install tbb eigen ninja - name: install-dependencies-windows if: runner.os == 'Windows' shell: bash From 4fd3a58b78ef2c53786bc370b9d6bbfe7ba4382c Mon Sep 17 00:00:00 2001 From: npkamath Date: Mon, 26 Feb 2024 04:29:37 -0500 Subject: [PATCH 23/27] documentation and renaming added --- dupin/detect/dynp.py | 32 ++++++++++++++++++++++++++++---- src/CMakeLists.txt | 2 +- src/dupininterface.cpp | 15 --------------- 3 files changed, 29 insertions(+), 20 deletions(-) delete mode 100644 src/dupininterface.cpp diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py index bf26407..c896977 100644 --- a/dupin/detect/dynp.py +++ b/dupin/detect/dynp.py @@ -1,5 +1,5 @@ """Implements dynamic programming class for optimal segementation algorithm.""" -import _DynP +import _dupin import numpy as np @@ -18,13 +18,37 @@ class DynP: min_size: int Minimum size of a segment. Changing will not provide optimal detection, but will reduce runtime. + + + Methods + ------- + __init__(self, data: np.ndarray, num_bkps: int, jump: int, min_size: int) + Initializes the DynamicProgramming instance with the time series data + and parameters. + set_num_threads(self, num_threads: int) + Sets the number of threads to be used for parallel computation. + fit(self, num_bkps: int) -> list + Calculates the cost matrix and identifies the optimal breakpoints in + the time series data. + + Example Usage + ------------- + >>> import numpy as np + >>> from dynp import DynP + >>> data = np.random.rand(100, 1) # Simulated time series data + >>> num_bkps = 3 # Number of breakpoints to detect + >>> jump = 1 # Interval for checking potential breakpoints + >>> min_size = 3 # Minimum size of a segment + >>> model = _dupin(data, num_bkps, jump, min_size) + >>> breakpoints = model.fit() + >>> print(breakpoints) """ def __init__( self, data: np.ndarray, num_bkps: int, jump: int, min_size: int ): """Initialize the DynamicProgramming instance with given parameters.""" - self.dynp = _DynP.DynamicProgramming(data, num_bkps, jump, min_size) + self._dupin = _dupin.DynamicProgramming(data, num_bkps, jump, min_size) def set_num_threads(self, num_threads: int): """Set the number of threads for parallelization. @@ -35,7 +59,7 @@ def set_num_threads(self, num_threads: int): The number of threads to use during computation. Default is determined automatically. """ - self.dynp.set_threads(num_threads) + self._dupin.set_threads(num_threads) def fit(self, num_bkps: int) -> list: """Calculate the cost matrix and return the breakpoints. @@ -49,4 +73,4 @@ def fit(self, num_bkps: int) -> list: ------- list: A list of integers representing the breakpoints. """ - return self.dynp.fit() + return self._dupin.fit() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cc57a68..7fda63a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -list(APPEND dupin_cxx "dupininterface.cpp" "dupin.h" "dupin.cpp") +list(APPEND dupin_cxx "module.cpp" "dupin.h" "dupin.cpp") pybind11_add_module(_dupin ${dupin_cxx}) diff --git a/src/dupininterface.cpp b/src/dupininterface.cpp deleted file mode 100644 index c4e757a..0000000 --- a/src/dupininterface.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "dupin.h" -#include -#include -#include - -namespace py = pybind11; - -PYBIND11_MODULE(_DynP, m) { - py::class_(m, "DynamicProgramming") - .def(py::init<>()) - .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, - &DynamicProgramming::setCostMatrix) - .def("fit", &DynamicProgramming::fit) - .def("set_threads", &DynamicProgramming::set_parallelization); -} From 4dd27f86e10c65e02eb771f557541b0502afb2af Mon Sep 17 00:00:00 2001 From: npkamath Date: Mon, 26 Feb 2024 05:45:24 -0500 Subject: [PATCH 24/27] upper triangular restructured and dynp restructured --- dupin/detect/dynp.py | 6 +++--- src/dupin.cpp | 30 ++++++++++++++---------------- src/dupin.h | 31 +++++++++++++++---------------- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py index c896977..4661666 100644 --- a/dupin/detect/dynp.py +++ b/dupin/detect/dynp.py @@ -39,8 +39,8 @@ class DynP: >>> num_bkps = 3 # Number of breakpoints to detect >>> jump = 1 # Interval for checking potential breakpoints >>> min_size = 3 # Minimum size of a segment - >>> model = _dupin(data, num_bkps, jump, min_size) - >>> breakpoints = model.fit() + >>> model = Dynp(data, num_bkps, jump, min_size) + >>> breakpoints = model.fit(num_bkps) >>> print(breakpoints) """ @@ -73,4 +73,4 @@ def fit(self, num_bkps: int) -> list: ------- list: A list of integers representing the breakpoints. """ - return self._dupin.fit() + return self._dupin.fit(num_bkps) diff --git a/src/dupin.cpp b/src/dupin.cpp index 6a53f6d..5a93a45 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -13,14 +13,14 @@ using namespace std; using namespace Eigen; DynamicProgramming::DynamicProgramming() - : num_bkps(1), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {} + : num_features(0), num_timesteps(0), jump(1), min_size(3), cost_matrix(0) {} -DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, - int jump_, int min_size_) - : data(data), num_bkps(num_bkps_), - jump(jump_), min_size(min_size_) { + +DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, + int jump_, int min_size_) + : data(data), jump(jump_), min_size(min_size_), cost_matrix(data.rows()) { num_timesteps = data.rows(); - num_parameters = data.cols(); + num_features = data.cols(); } void DynamicProgramming::scale_data() { @@ -28,7 +28,7 @@ void DynamicProgramming::scale_data() { Eigen::VectorXd max_val = data.colwise().maxCoeff(); Eigen::VectorXd range = max_val - min_val; - for (int j = 0; j < num_parameters; ++j) { + for (int j = 0; j (0, num_timesteps), [&](const tbb::blocked_range &r) { for (int i = r.begin(); i < r.end(); ++i) { @@ -130,7 +129,7 @@ std::pair> DynamicProgramming::seg(int start, int end, return best; } -std::vector DynamicProgramming::compute_breakpoints() { +std::vector DynamicProgramming::compute_breakpoints(int num_bkps) { auto result = seg(0, num_timesteps - 1, num_bkps); std::vector breakpoints = result.second; std::sort(breakpoints.begin(), breakpoints.end()); @@ -139,12 +138,11 @@ std::vector DynamicProgramming::compute_breakpoints() { return breakpoints; } -std::vector DynamicProgramming::fit(int num_bkps_in){ - num_bkps = num_bkps_in; +std::vector DynamicProgramming::fit(int num_bkps){ if (!cost_computed){ initialize_cost_matrix(); } - return compute_breakpoints(); + return compute_breakpoints(num_bkps); } void set_parallelization(int num_threads) { diff --git a/src/dupin.h b/src/dupin.h index 515049c..29bf4a5 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -19,25 +19,25 @@ class DynamicProgramming { std::vector row_indices; int length; - int index(int row, int col) const { - return row_indices[row] + col - row; + // Helper function to compute the row_indices vector + void compute_row_indices() { + row_indices.resize(length); + for (int row = 0; row < length; ++row) { + row_indices[row] = row * (2 * length - row + 1) / 2; + } } public: - UpperTriangularMatrix() : length(0) {} - - void initialize(int n) { - length = n; - matrix.resize(n * (n + 1) / 2, 0.0); - row_indices.resize(n); - for (int row = 0; row < n; ++row) { - row_indices[row] = row * (2 * length - row + 1) / 2; - } + // Constructor that initializes the matrix and row_indices + UpperTriangularMatrix(int n) : length(n), matrix(n * (n + 1) / 2, 0.0) { + compute_row_indices(); } double &operator()(int row, int col) { - return matrix[index(row, col)]; + int idx = row_indices[row] + col - row; // Use precomputed index + return matrix[idx]; } + int getSize() const { return length; } }; // Struct for memoization key, combining start, end, and number of @@ -69,8 +69,7 @@ class DynamicProgramming { std::unordered_map>, MemoKeyHash> memo; - int num_bkps; // Number of breakpoints to detect. - int num_parameters; // Number of features in the dataset. + int num_features; // Number of features in the dataset. int num_timesteps; // Number of data points (time steps). int jump; // Interval for checking potential breakpoints. int min_size; // Minimum size of a segment. @@ -109,14 +108,14 @@ class DynamicProgramming { void initialize_cost_matrix(); // Returns the optimal set of breakpoints after segmentation. - std::vector compute_breakpoints(); + std::vector compute_breakpoints(int num_bkps); public: // Default constructor. DynamicProgramming(); // Parameterized constructor. - DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_, + DynamicProgramming(const Eigen::MatrixXd &data, int jump_, int min_size_); //Sets number of threads for parallelization From f51ff1c043a1e2b041a6b8039157b3dc6c7786ac Mon Sep 17 00:00:00 2001 From: npkamath Date: Mon, 26 Feb 2024 05:46:11 -0500 Subject: [PATCH 25/27] upper triangular restructured and dynp restructured --- src/module.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 src/module.cpp diff --git a/src/module.cpp b/src/module.cpp new file mode 100644 index 0000000..4ec5843 --- /dev/null +++ b/src/module.cpp @@ -0,0 +1,15 @@ +#include "dupin.h" +#include +#include +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_dupin, m) { + py::class_(m, "DynamicProgramming") + .def(py::init<>()) + .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, + &DynamicProgramming::setCostMatrix) + .def("fit", &DynamicProgramming::fit) + .def("set_threads", &DynamicProgramming::set_parallelization); +} From c0afe6f0707fe8039f7cd33506ca791fd5dfc9de Mon Sep 17 00:00:00 2001 From: Ninad Kmath <98561561+npkamath@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:46:45 -0500 Subject: [PATCH 26/27] conditionals Co-authored-by: Brandon Butler --- src/dupin.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dupin.cpp b/src/dupin.cpp index 5a93a45..581b545 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -111,7 +111,9 @@ std::pair> DynamicProgramming::seg(int start, int end, std::pair> best = {std::numeric_limits::infinity(), {}}; for (int bkp = start + min_size; bkp < end; bkp++) { - if ((bkp - start) >= min_size && (end - bkp) >= min_size) { + if ((bkp - start) < min_size || (end - bkp) < min_size) { + continue; + } auto left = seg(start, bkp, num_bkps - 1); auto right = seg(bkp, end, 0); double cost = left.first + right.first; From 77e3e0057e977e5a86e8d41445caeb35e58f7dff Mon Sep 17 00:00:00 2001 From: npkamath Date: Mon, 26 Feb 2024 06:40:53 -0500 Subject: [PATCH 27/27] naming and other cpp changes --- dupin/detect/dynp.py | 4 +-- src/dupin.cpp | 63 ++++++++++++++++++++++++-------------------- src/dupin.h | 21 +++++++-------- 3 files changed, 46 insertions(+), 42 deletions(-) diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py index 4661666..201b6cb 100644 --- a/dupin/detect/dynp.py +++ b/dupin/detect/dynp.py @@ -61,7 +61,7 @@ def set_num_threads(self, num_threads: int): """ self._dupin.set_threads(num_threads) - def fit(self, num_bkps: int) -> list: + def fit(self, num_breakpoints: int) -> list[int]: """Calculate the cost matrix and return the breakpoints. Parameters @@ -73,4 +73,4 @@ def fit(self, num_bkps: int) -> list: ------- list: A list of integers representing the breakpoints. """ - return self._dupin.fit(num_bkps) + return self._dupin.fit(num_breakpoints) diff --git a/src/dupin.cpp b/src/dupin.cpp index 581b545..a5681c5 100644 --- a/src/dupin.cpp +++ b/src/dupin.cpp @@ -42,45 +42,53 @@ void DynamicProgramming::regression_setup(linear_fit_struct &lfit) { lfit.y = data; } -Eigen::VectorXd DynamicProgramming::regression_line(int start, int end, int dim, - linear_fit_struct &lfit) { - int n = end - start; - Eigen::VectorXd x = lfit.x.segment(start, n); - Eigen::VectorXd y = lfit.y.col(dim).segment(start, n); +//work in progress, the rowwise colwise is messing up +Eigen::MatrixXd DynamicProgramming::regression_lines(int start, int end, linear_fit_struct &lfit) { + int n = end - start; + Eigen::VectorXd x = lfit.x.segment(start, n); + Eigen::MatrixXd y = lfit.y.block(start, 0, n, num_features); - double x_mean = x.mean(); - double y_mean = y.mean(); + // Ensure x is in a two-dimensional form for broadcasting + Eigen::MatrixXd x_matrix = x.replicate(1, num_features); - Eigen::VectorXd x_centered = x.array() - x_mean; - Eigen::VectorXd y_centered = y.array() - y_mean; + // Calculate means + double x_mean = x.mean(); + Eigen::VectorXd y_mean = y.colwise().mean(); - double slope = x_centered.dot(y_centered) / x_centered.squaredNorm(); - double intercept = y_mean - slope * x_mean; + // Center the data around 0 + Eigen::MatrixXd x_centered = x_matrix.colwise() - Eigen::VectorXd::Constant(n, x_mean); + Eigen::MatrixXd y_centered = y.rowwise() - y_mean.transpose(); - return x.unaryExpr( - [slope, intercept](double xi) { return slope * xi + intercept; }); + // Calculate slopes for each feature + Eigen::VectorXd slope = (x_centered.array() * y_centered.array()).colwise().sum() / x_centered.array().square().sum(); + + // Calculate intercepts for each feature + Eigen::VectorXd intercept = y_mean.array() - slope.array() * x_mean; + + // everything till this line is functioning fine; I might be overcomplicating it + Eigen::MatrixXd regression_lines = (x_matrix.array().colwise() - x_mean).colwise() * slope.array() + intercept.transpose().array(); + + return regression_lines; } -double DynamicProgramming::l2_cost(Eigen::MatrixXd &predicted_y, int start, int end) { - Eigen::MatrixXd diff = predicted_y.block(start, 0, end - start, num_features) - - data.block(start, 0, end - start, num_features); - return std::sqrt(diff.array().square().sum()); +double DynamicProgramming::l2_cost(const Eigen::MatrixXd &predicted_y, int start, int end) { + Eigen::MatrixXd diff = predicted_y.block(start, 0, end - start, num_features) - + data.block(start, 0, end - start, num_features); + return std::sqrt(diff.array().square().sum()); } -Eigen::MatrixXd DynamicProgramming::predicted(int start, int end, - linear_fit_struct &lfit) { - Eigen::MatrixXd predicted_y(num_timesteps, num_features); - for (int i = 0; i < num_features; ++i) { - predicted_y.block(start, i, end - start, 1) = - regression_line(start, end, i, lfit); - } - return predicted_y; +void DynamicProgramming::predicted(int start, int end, linear_fit_struct &lfit, + Eigen::MatrixXd &predicted_y) { + predicted_y.block(start, 0, end - start, num_features) = regression_lines(start, end, lfit); } double DynamicProgramming::cost_function(int start, int end) { linear_fit_struct lfit; regression_setup(lfit); - Eigen::MatrixXd predicted_y = predicted(start, end, lfit); + + Eigen::MatrixXd predicted_y(num_timesteps, num_features); + predicted(start, end, lfit, predicted_y); // Fill the predicted_y matrix + return l2_cost(predicted_y, start, end); } @@ -134,9 +142,6 @@ std::pair> DynamicProgramming::seg(int start, int end, std::vector DynamicProgramming::compute_breakpoints(int num_bkps) { auto result = seg(0, num_timesteps - 1, num_bkps); std::vector breakpoints = result.second; - std::sort(breakpoints.begin(), breakpoints.end()); - breakpoints.erase(std::unique(breakpoints.begin(), breakpoints.end()), - breakpoints.end()); return breakpoints; } diff --git a/src/dupin.h b/src/dupin.h index 29bf4a5..18df333 100644 --- a/src/dupin.h +++ b/src/dupin.h @@ -82,26 +82,25 @@ class DynamicProgramming { Eigen::MatrixXd y; // Dependent variable (labels). Eigen::VectorXd x; // z Independent variable (time steps). }; - // Scales the dataset using min-max normalization. + // Scales the dataset using min-max normalization. void scale_data(); - // Prepares data for linear regression. + // Prepares data for linear regression, setting up the independent variable 'x'. void regression_setup(linear_fit_struct &lfit); - // Calculates the regression line for a given data segment. - Eigen::VectorXd regression_line(int start, int end, int dim, - linear_fit_struct &lfit); + // Computes regression parameters (slope and intercept) for all dimensions simultaneously. + Eigen::MatrixXd regression_lines(int start, int end, linear_fit_struct &lfit); - // Generates predicted values based on the linear regression model. - Eigen::MatrixXd predicted(int start, int end, linear_fit_struct &lfit); + // Generates predicted values based on the linear regression model for all features. + void predicted(int start, int end, linear_fit_struct &lfit, Eigen::MatrixXd &predicted_y); - // Calculates L2 cost (Euclidean distance) between predicted and actual data. - double l2_cost(Eigen::MatrixXd &predicted_y, int start, int end); + // Calculates L2 cost (Euclidean distance) between predicted and actual data for a given segment. + double l2_cost(const Eigen::MatrixXd &predicted_y, int start, int end); - // Computes the cost of a specific data segment using linear regression. + // Computes the cost of a specific data segment using linear regression and L2 cost. double cost_function(int start, int end); - // Recursive function for dynamic programming segmentation. + // Recursive function for dynamic programming segmentation. std::pair> seg(int start, int end, int num_bkps); // Initializes and fills the cost matrix for all data segments.