Skip to content

Commit

Permalink
support column batch for polynomial division CPU backend
Browse files Browse the repository at this point in the history
  • Loading branch information
ShanieWinitz committed Nov 13, 2024
1 parent 657b7ac commit e875302
Show file tree
Hide file tree
Showing 3 changed files with 497 additions and 443 deletions.
63 changes: 39 additions & 24 deletions icicle/backend/cpu/src/field/cpu_vec_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,12 +803,19 @@ REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice<extension_t>);

/*********************************** Highest non-zero idx ***********************************/
template <typename T>
eIcicleError cpu_highest_non_zero_idx(
const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
eIcicleError cpu_highest_non_zero_idx_internal(
const Device& device,
const T* input,
uint64_t size,
const VecOpsConfig& config,
int64_t* out_idx /*OUT*/,
int32_t idx_in_batch_to_calc)
{
ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument";
uint64_t stride = config.columns_batch ? config.batch_size : 1;
for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
uint32_t start_idx = (idx_in_batch_to_calc == -1) ? 0 : idx_in_batch_to_calc;
uint32_t end_idx = (idx_in_batch_to_calc == -1) ? config.batch_size : idx_in_batch_to_calc + 1;
for (uint64_t idx_in_batch = start_idx; idx_in_batch < end_idx; ++idx_in_batch) {
out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0]
const T* curr_input =
config.columns_batch ? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector
Expand All @@ -822,6 +829,13 @@ eIcicleError cpu_highest_non_zero_idx(
return eIcicleError::SUCCESS;
}

template <typename T>
eIcicleError cpu_highest_non_zero_idx(
const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
{
return cpu_highest_non_zero_idx_internal(device, input, size, config, out_idx, -1);
}

REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx<scalar_t>);

/*********************************** Polynomial evaluation ***********************************/
Expand Down Expand Up @@ -887,39 +901,40 @@ eIcicleError cpu_poly_divide(
T* r_out /*OUT*/,
uint64_t r_size)
{
if (config.batch_size != 1 && config.columns_batch) {
ICICLE_LOG_ERROR << "polynomial division is not implemented for column batch. Planned for v3.2";
return eIcicleError::API_NOT_IMPLEMENTED;
}

uint32_t stride = config.columns_batch ? config.batch_size : 1;
auto numerator_deg = std::make_unique<int64_t[]>(config.batch_size);
auto denominator_deg = std::make_unique<int64_t[]>(config.batch_size);
auto deg_r = std::make_unique<int64_t[]>(config.batch_size);
cpu_highest_non_zero_idx(device, numerator, numerator_size, config, numerator_deg.get());
cpu_highest_non_zero_idx(device, denominator, denominator_size, config, denominator_deg.get());
memset(r_out, 0, sizeof(T) * (r_size * config.batch_size));
memcpy(r_out, numerator, sizeof(T) * (numerator_size * config.batch_size));

for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
ICICLE_ASSERT(r_size >= numerator_deg[idx_in_batch] + 1)
<< "polynomial division expects r(x) size to be similar to numerator size and higher than numerator "
"degree(x).\nr_size = "
<< r_size << ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch];
ICICLE_ASSERT(q_size >= (numerator_deg[idx_in_batch] - denominator_deg[idx_in_batch] + 1))
<< "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1.\nq_size = " << q_size
<< ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch] << ", denominator_deg["
<< idx_in_batch << "] = " << denominator_deg[idx_in_batch];
const T* curr_numerator =
config.columns_batch ? numerator + idx_in_batch : numerator + idx_in_batch * numerator_size;
const T* curr_denominator =
config.columns_batch ? denominator + idx_in_batch : denominator + idx_in_batch * denominator_size;
T* curr_q_out = config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size;
T* curr_r_out = config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size;

int64_t numerator_deg, denominator_deg;
cpu_highest_non_zero_idx(device, curr_numerator, numerator_size, default_vec_ops_config(), &numerator_deg);
cpu_highest_non_zero_idx(device, curr_denominator, denominator_size, default_vec_ops_config(), &denominator_deg);
ICICLE_ASSERT(r_size >= numerator_deg + 1)
<< "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)";
ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1))
<< "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1";

memset(curr_r_out, 0, sizeof(T) * r_size);
memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1));

// invert largest coeff of b
const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]);
int64_t deg_r = numerator_deg;
while (deg_r >= denominator_deg) {
const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg[idx_in_batch] * stride]);
deg_r[idx_in_batch] = numerator_deg[idx_in_batch];
while (deg_r[idx_in_batch] >= denominator_deg[idx_in_batch]) {
// each iteration is removing the largest monomial in r until deg(r)<deg(b)
school_book_division_step_cpu(curr_r_out, curr_q_out, curr_denominator, deg_r, denominator_deg, lc_b_inv, stride);
school_book_division_step_cpu(
curr_r_out, curr_q_out, curr_denominator, deg_r[idx_in_batch], denominator_deg[idx_in_batch], lc_b_inv, stride);
// compute degree of r
cpu_highest_non_zero_idx(device, r_out, deg_r, default_vec_ops_config(), &deg_r);
cpu_highest_non_zero_idx_internal(device, r_out, deg_r[idx_in_batch], config, deg_r.get(), idx_in_batch);
}
}
return eIcicleError::SUCCESS;
Expand Down
82 changes: 29 additions & 53 deletions icicle/tests/test_field_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,23 +504,6 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync)
}
};

// Option 1: Initialize each input matrix in the batch with the same ascending values
// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
// for (uint32_t i = 0; i < R * C; i++) {
// if(columns_batch){
// h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i);
// } else {
// h_inout[idx_in_batch * R * C + i] = TypeParam::from(i);
// }
// }
// }

// Option 2: Initialize the entire input array with ascending values
// for (int i = 0; i < total_size; i++) {
// h_inout[i] = TypeParam::from(i);
// }

// Option 3: Initialize the entire input array with random values
TypeParam::rand_host_many(h_inout.get(), total_size);

// Reference implementation
Expand Down Expand Up @@ -589,23 +572,6 @@ TYPED_TEST(FieldApiTest, bitReverse)
END_TIMER(BIT_REVERSE, oss.str().c_str(), measure);
};

// // Option 1: Initialize each input vector in the batch with the same ascending values
// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
// for (uint32_t i = 0; i < N; i++) {
// if(columns_batch){
// in_a[idx_in_batch + batch_size * i] = TypeParam::from(i);
// } else {
// in_a[idx_in_batch * N + i] = TypeParam::from(i);
// }
// }
// }

// // Option 2: Initialize the entire input array with ascending values
// for (int i = 0; i < total_size; i++) {
// in_a[i] = TypeParam::from(i);
// }

// Option 3: Initialize the entire input array with random values
FieldApiTest<TypeParam>::random_samples(in_a.get(), total_size);

// Reference implementation
Expand Down Expand Up @@ -797,42 +763,53 @@ TEST_F(FieldApiTestBase, polynomialEval)

TEST_F(FieldApiTestBase, polynomialDivision)
{
const uint64_t numerator_size = 1 << 4;
const uint64_t denominator_size = 1 << 3;
int seed = time(0);
srand(seed);
const uint64_t numerator_size = 1 << (rand() % 3 + 5);
const uint64_t denominator_size = 1 << (rand() % 2 + 3);
const uint64_t q_size = numerator_size - denominator_size + 1;
const uint64_t r_size = numerator_size;
const int batch_size = 10 + rand() % 10;

// basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x)

// randomize matrix with rows/cols as polynomials
auto numerator = std::make_unique<scalar_t[]>(numerator_size * batch_size);
auto denominator = std::make_unique<scalar_t[]>(denominator_size * batch_size);
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);

// Add padding to each row so that the degree is lower than the size
const int zero_pad_length = 5;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < zero_pad_length; ++j) {
numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
}
}

for (auto device : s_registered_devices) {
ICICLE_CHECK(icicle_set_device(device));
for (int columns_batch = 0; columns_batch <= 1; columns_batch++) {
ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
<< "]";
ICICLE_LOG_INFO << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
<< "]";

// randomize matrix with rows/cols as polynomials
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);

// Add padding to each vector so that the degree is lower than the size
const int zero_pad_length = 1;
if (columns_batch) {
for (int i = 0; i < batch_size * zero_pad_length; i++) {
numerator[batch_size * numerator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
denominator[batch_size * denominator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
}
} else {
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < zero_pad_length; ++j) {
numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
}
}
}

auto q = std::make_unique<scalar_t[]>(q_size * batch_size);
auto r = std::make_unique<scalar_t[]>(r_size * batch_size);

auto config = default_vec_ops_config();
config.batch_size = columns_batch ? batch_size - zero_pad_length : batch_size; // skip the zero cols
config.batch_size = batch_size;
config.columns_batch = columns_batch;
// TODO v3.2 support column batch for this API
if (columns_batch) {
if (columns_batch && device == "CUDA") {
ICICLE_LOG_INFO << "Skipping polynomial division column batch";
continue;
}
Expand All @@ -853,7 +830,6 @@ TEST_F(FieldApiTestBase, polynomialDivision)
polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get());

for (int i = 0; i < config.batch_size; ++i) {
// ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i];
ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]);
}
}
Expand Down
Loading

0 comments on commit e875302

Please sign in to comment.