diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
index 22c257023..09e3d8095 100644
--- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
+++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
@@ -803,12 +803,19 @@ REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice<extension_t>);
 
 /*********************************** Highest non-zero idx ***********************************/
 template <typename T>
-eIcicleError cpu_highest_non_zero_idx(
-  const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
+eIcicleError cpu_highest_non_zero_idx_internal(
+  const Device& device,
+  const T* input,
+  uint64_t size,
+  const VecOpsConfig& config,
+  int64_t* out_idx /*OUT*/,
+  int32_t idx_in_batch_to_calc)
 {
   ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument";
   uint64_t stride = config.columns_batch ? config.batch_size : 1;
-  for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
+  uint32_t start_idx = (idx_in_batch_to_calc == -1) ? 0 : idx_in_batch_to_calc;
+  uint32_t end_idx = (idx_in_batch_to_calc == -1) ? config.batch_size : idx_in_batch_to_calc + 1;
+  for (uint64_t idx_in_batch = start_idx; idx_in_batch < end_idx; ++idx_in_batch) {
     out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0]
     const T* curr_input =
       config.columns_batch ? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector
@@ -822,6 +829,13 @@ eIcicleError cpu_highest_non_zero_idx(
   return eIcicleError::SUCCESS;
 }
 
+template <typename T>
+eIcicleError cpu_highest_non_zero_idx(
+  const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
+{
+  return cpu_highest_non_zero_idx_internal(device, input, size, config, out_idx, -1);
+}
+
 REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx<scalar_t>);
 
 /*********************************** Polynomial evaluation ***********************************/
@@ -887,13 +901,24 @@ eIcicleError cpu_poly_divide(
   T* r_out /*OUT*/,
   uint64_t r_size)
 {
-  if (config.batch_size != 1 && config.columns_batch) {
-    ICICLE_LOG_ERROR << "polynomial division is not implemented for column batch. Planned for v3.2";
-    return eIcicleError::API_NOT_IMPLEMENTED;
-  }
-
   uint32_t stride = config.columns_batch ? config.batch_size : 1;
+  auto numerator_deg = std::make_unique<int64_t[]>(config.batch_size);
+  auto denominator_deg = std::make_unique<int64_t[]>(config.batch_size);
+  auto deg_r = std::make_unique<int64_t[]>(config.batch_size);
+  cpu_highest_non_zero_idx(device, numerator, numerator_size, config, numerator_deg.get());
+  cpu_highest_non_zero_idx(device, denominator, denominator_size, config, denominator_deg.get());
+  memset(r_out, 0, sizeof(T) * (r_size * config.batch_size));
+  memcpy(r_out, numerator, sizeof(T) * (numerator_size * config.batch_size));
+
   for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
+    ICICLE_ASSERT(r_size >= numerator_deg[idx_in_batch] + 1)
+      << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator "
+         "degree(x).\nr_size = "
+      << r_size << ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch];
+    ICICLE_ASSERT(q_size >= (numerator_deg[idx_in_batch] - denominator_deg[idx_in_batch] + 1))
+      << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1.\nq_size = " << q_size
+      << ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch] << ", denominator_deg["
+      << idx_in_batch << "] = " << denominator_deg[idx_in_batch];
     const T* curr_numerator =
       config.columns_batch ? numerator + idx_in_batch : numerator + idx_in_batch * numerator_size;
     const T* curr_denominator =
@@ -901,25 +926,15 @@ eIcicleError cpu_poly_divide(
     T* curr_q_out = config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size;
     T* curr_r_out = config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size;
 
-    int64_t numerator_deg, denominator_deg;
-    cpu_highest_non_zero_idx(device, curr_numerator, numerator_size, default_vec_ops_config(), &numerator_deg);
-    cpu_highest_non_zero_idx(device, curr_denominator, denominator_size, default_vec_ops_config(), &denominator_deg);
-    ICICLE_ASSERT(r_size >= numerator_deg + 1)
-      << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)";
-    ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1))
-      << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1";
-
-    memset(curr_r_out, 0, sizeof(T) * r_size);
-    memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1));
-
     // invert largest coeff of b
-    const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]);
-    int64_t deg_r = numerator_deg;
-    while (deg_r >= denominator_deg) {
+    const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg[idx_in_batch] * stride]);
+    deg_r[idx_in_batch] = numerator_deg[idx_in_batch];
+    while (deg_r[idx_in_batch] >= denominator_deg[idx_in_batch]) {
       // each iteration is removing the largest monomial in r until deg(r)<deg(b)
-      school_book_division_step_cpu(curr_r_out, curr_q_out, curr_denominator, deg_r, denominator_deg, lc_b_inv, stride);
+      school_book_division_step_cpu(
+        curr_r_out, curr_q_out, curr_denominator, deg_r[idx_in_batch], denominator_deg[idx_in_batch], lc_b_inv, stride);
       // compute degree of r
-      cpu_highest_non_zero_idx(device, r_out, deg_r, default_vec_ops_config(), &deg_r);
+      cpu_highest_non_zero_idx_internal(device, r_out, deg_r[idx_in_batch], config, deg_r.get(), idx_in_batch);
     }
   }
   return eIcicleError::SUCCESS;
diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp
index 703018797..48c938f2f 100644
--- a/icicle/tests/test_field_api.cpp
+++ b/icicle/tests/test_field_api.cpp
@@ -504,23 +504,6 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync)
     }
   };
 
-  // Option 1: Initialize each input matrix in the batch with the same ascending values
-  // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
-  //   for (uint32_t i = 0; i < R * C; i++) {
-  //     if(columns_batch){
-  //       h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i);
-  //     } else {
-  //       h_inout[idx_in_batch * R * C + i] = TypeParam::from(i);
-  //     }
-  //   }
-  // }
-
-  // Option 2: Initialize the entire input array with ascending values
-  // for (int i = 0; i < total_size; i++) {
-  //   h_inout[i] = TypeParam::from(i);
-  // }
-
-  // Option 3: Initialize the entire input array with random values
   TypeParam::rand_host_many(h_inout.get(), total_size);
 
   // Reference implementation
@@ -589,23 +572,6 @@ TYPED_TEST(FieldApiTest, bitReverse)
     END_TIMER(BIT_REVERSE, oss.str().c_str(), measure);
   };
 
-  // // Option 1: Initialize each input vector in the batch with the same ascending values
-  // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
-  //   for (uint32_t i = 0; i < N; i++) {
-  //     if(columns_batch){
-  //       in_a[idx_in_batch + batch_size * i] = TypeParam::from(i);
-  //     } else {
-  //       in_a[idx_in_batch * N + i] = TypeParam::from(i);
-  //     }
-  //   }
-  // }
-
-  // // Option 2: Initialize the entire input array with ascending values
-  // for (int i = 0; i < total_size; i++) {
-  //   in_a[i] = TypeParam::from(i);
-  // }
-
-  // Option 3: Initialize the entire input array with random values
   FieldApiTest<TypeParam>::random_samples(in_a.get(), total_size);
 
   // Reference implementation
@@ -797,42 +763,53 @@ TEST_F(FieldApiTestBase, polynomialEval)
 
 TEST_F(FieldApiTestBase, polynomialDivision)
 {
-  const uint64_t numerator_size = 1 << 4;
-  const uint64_t denominator_size = 1 << 3;
+  int seed = time(0);
+  srand(seed);
+  const uint64_t numerator_size = 1 << (rand() % 3 + 5);
+  const uint64_t denominator_size = 1 << (rand() % 2 + 3);
   const uint64_t q_size = numerator_size - denominator_size + 1;
   const uint64_t r_size = numerator_size;
   const int batch_size = 10 + rand() % 10;
 
   // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x)
 
-  // randomize matrix with rows/cols as polynomials
   auto numerator = std::make_unique<scalar_t[]>(numerator_size * batch_size);
   auto denominator = std::make_unique<scalar_t[]>(denominator_size * batch_size);
-  scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
-  scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);
-
-  // Add padding to each row so that the degree is lower than the size
-  const int zero_pad_length = 5;
-  for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < zero_pad_length; ++j) {
-      numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
-      denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
-    }
-  }
 
   for (auto device : s_registered_devices) {
     ICICLE_CHECK(icicle_set_device(device));
     for (int columns_batch = 0; columns_batch <= 1; columns_batch++) {
-      ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
-                       << "]";
+      ICICLE_LOG_INFO << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
+                      << "]";
+
+      // randomize matrix with rows/cols as polynomials
+      scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
+      scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);
+
+      // Add padding to each vector so that the degree is lower than the size
+      const int zero_pad_length = 1;
+      if (columns_batch) {
+        for (int i = 0; i < batch_size * zero_pad_length; i++) {
+          numerator[batch_size * numerator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
+          denominator[batch_size * denominator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
+        }
+      } else {
+        for (int i = 0; i < batch_size; ++i) {
+          for (int j = 0; j < zero_pad_length; ++j) {
+            numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
+            denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
+          }
+        }
+      }
+
       auto q = std::make_unique<scalar_t[]>(q_size * batch_size);
       auto r = std::make_unique<scalar_t[]>(r_size * batch_size);
 
       auto config = default_vec_ops_config();
-      config.batch_size = columns_batch ? batch_size - zero_pad_length : batch_size; // skip the zero cols
+      config.batch_size = batch_size;
       config.columns_batch = columns_batch;
       // TODO v3.2 support column batch for this API
-      if (columns_batch) {
+      if (columns_batch && device == "CUDA") {
         ICICLE_LOG_INFO << "Skipping polynomial division column batch";
         continue;
       }
@@ -853,7 +830,6 @@ TEST_F(FieldApiTestBase, polynomialDivision)
       polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get());
 
       for (int i = 0; i < config.batch_size; ++i) {
-        // ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i];
         ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]);
       }
     }
diff --git a/icicle/tests/test_polynomial_api.cpp b/icicle/tests/test_polynomial_api.cpp
index c859cefd4..124eee26e 100644
--- a/icicle/tests/test_polynomial_api.cpp
+++ b/icicle/tests/test_polynomial_api.cpp
@@ -47,12 +47,6 @@ class PolynomialTest : public ::testing::Test
       auto init_domain_config = default_ntt_init_domain_config();
       ntt_init_domain(scalar_t::omega(MAX_NTT_LOG_SIZE), init_domain_config);
     }
-
-    // choose random device for testing
-    srand(time(NULL));
-    const int dev_idx = rand() % s_registered_devices.size();
-    icicle_set_device(s_registered_devices.at(dev_idx));
-    ICICLE_LOG_INFO << "setting device " << s_registered_devices.at(dev_idx) << " for polynomial tests";
   }
 
   static void TearDownTestSuite() {}
@@ -137,395 +131,446 @@ class PolynomialTest : public ::testing::Test
 
 TEST_F(PolynomialTest, evaluation)
 {
-  const scalar_t coeffs[3] = {one, two, three};
-  auto f = Polynomial_t::from_coefficients(coeffs, 3);
-  scalar_t x = scalar_t::rand_host();
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const scalar_t coeffs[3] = {one, two, three};
+    auto f = Polynomial_t::from_coefficients(coeffs, 3);
+    scalar_t x = scalar_t::rand_host();
 
-  auto f_x = f(x); // evaluation
+    auto f_x = f(x); // evaluation
 
-  auto expected_f_x = one + two * x + three * x * x;
+    auto expected_f_x = one + two * x + three * x * x;
 
-  EXPECT_EQ(f_x, expected_f_x);
+    EXPECT_EQ(f_x, expected_f_x);
+  }
 }
 
 TEST_F(PolynomialTest, evaluationOnDomain)
 {
-  int size = 1 << 5;
-  auto f = PolynomialTest::randomize_polynomial(size);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    int size = 1 << 5;
+    auto f = PolynomialTest::randomize_polynomial(size);
 
-  size *= 2; // evaluating on a larger domain
-  scalar_t w;
-  ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>((int)log2(size), &w));
+    size *= 2; // evaluating on a larger domain
+    scalar_t w;
+    ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>((int)log2(size), &w));
 
-  // construct domain as rou
-  scalar_t x = one;
-  auto domain = std::make_unique<scalar_t[]>(size);
-  for (int i = 0; i < size; ++i) {
-    domain[i] = x;
-    x = x * w;
-  }
+    // construct domain as rou
+    scalar_t x = one;
+    auto domain = std::make_unique<scalar_t[]>(size);
+    for (int i = 0; i < size; ++i) {
+      domain[i] = x;
+      x = x * w;
+    }
 
-  // evaluate f on the domain (equivalent to NTT)
-  auto evaluations = std::make_unique<scalar_t[]>(size);
-  f.evaluate_on_domain(domain.get(), size, evaluations.get());
+    // evaluate f on the domain (equivalent to NTT)
+    auto evaluations = std::make_unique<scalar_t[]>(size);
+    f.evaluate_on_domain(domain.get(), size, evaluations.get());
 
-  // construct g from the evaluations of f
-  auto g = Polynomial_t::from_rou_evaluations(evaluations.get(), size);
+    // construct g from the evaluations of f
+    auto g = Polynomial_t::from_rou_evaluations(evaluations.get(), size);
 
-  // check that f==g
-  ASSERT_EQ((f - g).degree(), -1);
+    // check that f==g
+    ASSERT_EQ((f - g).degree(), -1);
+  }
 }
 
 TEST_F(PolynomialTest, evaluateOnRouDomain)
 {
-  const int logsize = 8;
-  const int size = 1 << logsize;
-  auto f = randomize_polynomial(size);
-  auto g = randomize_polynomial(size, true, true /*from_evals*/);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int logsize = 8;
+    const int size = 1 << logsize;
+    auto f = randomize_polynomial(size);
+    auto g = randomize_polynomial(size, true, true /*from_evals*/);
+
+    // build domain
+    auto test = [&](auto& p, int domain_logsize) {
+      const int domain_size = 1 << domain_logsize;
+
+      scalar_t w;
+      ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>(domain_logsize, &w));
+
+      auto domain = std::make_unique<scalar_t[]>(domain_size);
+      domain[0] = scalar_t::one();
+      for (int i = 1; i < domain_size; ++i) {
+        domain[i] = domain[i - 1] * w;
+      }
+
+      // evaluation on domain
+      auto evals_naive = std::make_unique<scalar_t[]>(domain_size);
+      START_TIMER(naive_evals);
+      p.evaluate_on_domain(domain.get(), domain_size, evals_naive.get());
+      END_TIMER(naive_evals, "naive evals took", MEASURE);
+
+      // evaluate on rou domain
+      auto evals_rou_domain = std::make_unique<scalar_t[]>(domain_size);
+      START_TIMER(rou_domain_evals);
+      p.evaluate_on_rou_domain(domain_logsize, evals_rou_domain.get());
+      END_TIMER(rou_domain_evals, "evals on rou domain took", MEASURE);
+
+      ASSERT_EQ(0, memcmp(evals_naive.get(), evals_rou_domain.get(), domain_size * sizeof(scalar_t)));
+    };
 
-  // build domain
-  auto test = [&](auto& p, int domain_logsize) {
-    const int domain_size = 1 << domain_logsize;
+    // test f (in coeffs state)
+    test(f, logsize + 2); // evaluate on larger domain
+    test(f, logsize - 3); // evaluate on smaller domain
+    test(f, logsize);     // evaluate on domain with size like poly
+    // test g (in evals state)
+    test(g, logsize + 2); // evaluate on larger domain
+    test(g, logsize - 3); // evaluate on smaller domain
+    test(g, logsize);     // evaluate on domain with size like poly
+
+    // test f*f (in reversed evals state)
+    auto f_squared = f * f;
+    auto new_logsize = logsize + 1;   // f_squared is twice the degree and size of f
+    test(f_squared, new_logsize + 2); // evaluate on larger domain
+    test(f_squared, new_logsize - 3); // evaluate on smaller domain
+    test(f_squared, new_logsize);     // evaluate on domain with size like poly
+  }
+}
 
-    scalar_t w;
-    ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>(domain_logsize, &w));
+TEST_F(PolynomialTest, fromEvaluations)
+{
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 100;
+    const int log_size = (int)ceil(log2(size));
+    const int nof_evals = 1 << log_size;
+    auto f = randomize_polynomial(size);
 
-    auto domain = std::make_unique<scalar_t[]>(domain_size);
-    domain[0] = scalar_t::one();
-    for (int i = 1; i < domain_size; ++i) {
-      domain[i] = domain[i - 1] * w;
+    // evaluate f on roots of unity
+    scalar_t omega = scalar_t::omega(log_size);
+    auto evals = std::make_unique<scalar_t[]>(nof_evals);
+    scalar_t x = one;
+    for (int i = 0; i < nof_evals; ++i) {
+      evals[i] = f(x);
+      x = x * omega;
     }
 
-    // evaluation on domain
-    auto evals_naive = std::make_unique<scalar_t[]>(domain_size);
-    START_TIMER(naive_evals);
-    p.evaluate_on_domain(domain.get(), domain_size, evals_naive.get());
-    END_TIMER(naive_evals, "naive evals took", MEASURE);
-
-    // evaluate on rou domain
-    auto evals_rou_domain = std::make_unique<scalar_t[]>(domain_size);
-    START_TIMER(rou_domain_evals);
-    p.evaluate_on_rou_domain(domain_logsize, evals_rou_domain.get());
-    END_TIMER(rou_domain_evals, "evals on rou domain took", MEASURE);
-
-    ASSERT_EQ(0, memcmp(evals_naive.get(), evals_rou_domain.get(), domain_size * sizeof(scalar_t)));
-  };
-
-  // test f (in coeffs state)
-  test(f, logsize + 2); // evaluate on larger domain
-  test(f, logsize - 3); // evaluate on smaller domain
-  test(f, logsize);     // evaluate on domain with size like poly
-  // test g (in evals state)
-  test(g, logsize + 2); // evaluate on larger domain
-  test(g, logsize - 3); // evaluate on smaller domain
-  test(g, logsize);     // evaluate on domain with size like poly
-
-  // test f*f (in reversed evals state)
-  auto f_squared = f * f;
-  auto new_logsize = logsize + 1;   // f_squared is twice the degree and size of f
-  test(f_squared, new_logsize + 2); // evaluate on larger domain
-  test(f_squared, new_logsize - 3); // evaluate on smaller domain
-  test(f_squared, new_logsize);     // evaluate on domain with size like poly
-}
+    // construct g from f's evaluations
+    auto g = Polynomial_t::from_rou_evaluations(evals.get(), nof_evals);
 
-TEST_F(PolynomialTest, fromEvaluations)
-{
-  const int size = 100;
-  const int log_size = (int)ceil(log2(size));
-  const int nof_evals = 1 << log_size;
-  auto f = randomize_polynomial(size);
-
-  // evaluate f on roots of unity
-  scalar_t omega = scalar_t::omega(log_size);
-  auto evals = std::make_unique<scalar_t[]>(nof_evals);
-  scalar_t x = one;
-  for (int i = 0; i < nof_evals; ++i) {
-    evals[i] = f(x);
-    x = x * omega;
-  }
-
-  // construct g from f's evaluations
-  auto g = Polynomial_t::from_rou_evaluations(evals.get(), nof_evals);
-
-  // make sure they are equal, that is f-g=0
-  auto h = f - g;
-  EXPECT_EQ(h.degree(), -1); // degree -1 is the zero polynomial
+    // make sure they are equal, that is f-g=0
+    auto h = f - g;
+    EXPECT_EQ(h.degree(), -1); // degree -1 is the zero polynomial
+  }
 }
 
 TEST_F(PolynomialTest, fromEvaluationsNotPowerOfTwo)
 {
-  const int size = 100;
-  const int log_size = (int)ceil(log2(size));
-  const int nof_evals = size;
-  auto f = randomize_polynomial(size);
-
-  // evaluate f on roots of unity
-  scalar_t omega = scalar_t::omega(log_size);
-  scalar_t evals[nof_evals] = {0};
-  scalar_t x = one;
-  for (int i = 0; i < nof_evals; ++i) {
-    evals[i] = f(x);
-    x = x * omega;
-  }
-
-  // construct g from f's evaluations
-  auto g = Polynomial_t::from_rou_evaluations(evals, nof_evals);
-
-  // since NTT works on a power of two (therefore the extra elements are arbitrary), f!=g but they should evaluate to
-  // the same values on the roots of unity due to construction.
-  x = one;
-  for (int i = 0; i < nof_evals; ++i) {
-    EXPECT_EQ(f(x), g(x));
-    x = x * omega;
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 100;
+    const int log_size = (int)ceil(log2(size));
+    const int nof_evals = size;
+    auto f = randomize_polynomial(size);
+
+    // evaluate f on roots of unity
+    scalar_t omega = scalar_t::omega(log_size);
+    scalar_t evals[nof_evals] = {0};
+    scalar_t x = one;
+    for (int i = 0; i < nof_evals; ++i) {
+      evals[i] = f(x);
+      x = x * omega;
+    }
+
+    // construct g from f's evaluations
+    auto g = Polynomial_t::from_rou_evaluations(evals, nof_evals);
+
+    // since NTT works on a power of two (therefore the extra elements are arbitrary), f!=g but they should evaluate to
+    // the same values on the roots of unity due to construction.
+    x = one;
+    for (int i = 0; i < nof_evals; ++i) {
+      EXPECT_EQ(f(x), g(x));
+      x = x * omega;
+    }
   }
 }
 
 TEST_F(PolynomialTest, addition)
 {
-  const int size_0 = 12, size_1 = 17;
-  auto f = randomize_polynomial(size_0);
-  auto g = randomize_polynomial(size_1);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size_0 = 12, size_1 = 17;
+    auto f = randomize_polynomial(size_0);
+    auto g = randomize_polynomial(size_1);
 
-  scalar_t x = scalar_t::rand_host();
-  auto f_x = f(x);
-  auto g_x = g(x);
-  auto fx_plus_gx = f_x + g_x;
+    scalar_t x = scalar_t::rand_host();
+    auto f_x = f(x);
+    auto g_x = g(x);
+    auto fx_plus_gx = f_x + g_x;
 
-  auto s = f + g;
-  auto s_x = s(x);
+    auto s = f + g;
+    auto s_x = s(x);
 
-  EXPECT_EQ(fx_plus_gx, s_x);
+    EXPECT_EQ(fx_plus_gx, s_x);
+  }
 }
 
 TEST_F(PolynomialTest, addition_inplace)
 {
-  const int size_0 = 2, size_1 = 2;
-  auto f = randomize_polynomial(size_0);
-  auto g = randomize_polynomial(size_1);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size_0 = 2, size_1 = 2;
+    auto f = randomize_polynomial(size_0);
+    auto g = randomize_polynomial(size_1);
 
-  scalar_t x = scalar_t::rand_host();
-  auto f_x = f(x);
-  auto g_x = g(x);
-  auto fx_plus_gx = f_x + g_x;
+    scalar_t x = scalar_t::rand_host();
+    auto f_x = f(x);
+    auto g_x = g(x);
+    auto fx_plus_gx = f_x + g_x;
 
-  f += g;
-  auto s_x = f(x);
+    f += g;
+    auto s_x = f(x);
 
-  EXPECT_EQ(fx_plus_gx, s_x);
+    EXPECT_EQ(fx_plus_gx, s_x);
+  }
 }
 
 TEST_F(PolynomialTest, multiplication)
 {
-  const int size_0 = 1 << 15, size_1 = 1 << 12;
-  auto f = randomize_polynomial(size_0);
-  auto g = randomize_polynomial(size_1);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size_0 = 1 << 15, size_1 = 1 << 12;
+    auto f = randomize_polynomial(size_0);
+    auto g = randomize_polynomial(size_1);
 
-  scalar_t x = scalar_t::rand_host();
-  auto f_x = f(x);
-  auto g_x = g(x);
-  auto fx_mul_gx = f_x * g_x;
+    scalar_t x = scalar_t::rand_host();
+    auto f_x = f(x);
+    auto g_x = g(x);
+    auto fx_mul_gx = f_x * g_x;
 
-  START_TIMER(poly_mult_start);
-  auto m = f * g;
-  END_TIMER(poly_mult_start, "Polynomial multiplication took", MEASURE);
+    START_TIMER(poly_mult_start);
+    auto m = f * g;
+    END_TIMER(poly_mult_start, "Polynomial multiplication took", MEASURE);
 
-  auto m_x = m(x);
+    auto m_x = m(x);
 
-  EXPECT_EQ(fx_mul_gx, m_x);
+    EXPECT_EQ(fx_mul_gx, m_x);
+  }
 }
 
 TEST_F(PolynomialTest, multiplicationScalar)
 {
-  const int size = 1 << 15;
-  auto f = randomize_polynomial(size);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 1 << 15;
+    auto f = randomize_polynomial(size);
 
-  auto g = two * f;
-  auto h = f * three;
+    auto g = two * f;
+    auto h = f * three;
 
-  scalar_t x = scalar_t::rand_host();
-  auto f_x = f(x);
-  auto g_x = g(x);
-  auto h_x = h(x);
+    scalar_t x = scalar_t::rand_host();
+    auto f_x = f(x);
+    auto g_x = g(x);
+    auto h_x = h(x);
 
-  EXPECT_EQ(g_x, f_x * two);
-  EXPECT_EQ(h_x, f_x * three);
+    EXPECT_EQ(g_x, f_x * two);
+    EXPECT_EQ(h_x, f_x * three);
 
-  EXPECT_EQ(g.degree(), f.degree());
-  EXPECT_EQ(h.degree(), f.degree());
+    EXPECT_EQ(g.degree(), f.degree());
+    EXPECT_EQ(h.degree(), f.degree());
+  }
 }
 
 TEST_F(PolynomialTest, monomials)
 {
-  const scalar_t coeffs[3] = {one, zero, two}; // 1+2x^2
-  auto f = Polynomial_t::from_coefficients(coeffs, 3);
-  const auto x = three;
-  const auto expected_f_x = one + two * x * x;
-  auto f_x = f(x);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const scalar_t coeffs[3] = {one, zero, two}; // 1+2x^2
+    auto f = Polynomial_t::from_coefficients(coeffs, 3);
+    const auto x = three;
+    const auto expected_f_x = one + two * x * x;
+    auto f_x = f(x);
 
-  EXPECT_EQ(f_x, expected_f_x);
+    EXPECT_EQ(f_x, expected_f_x);
 
-  f.add_monomial_inplace(three, 1); // add 3x
-  const auto expected_addmonmon_f_x = f_x + three * x;
-  const auto addmonom_f_x = f(x);
+    f.add_monomial_inplace(three, 1); // add 3x
+    const auto expected_addmonmon_f_x = f_x + three * x;
+    const auto addmonom_f_x = f(x);
 
-  EXPECT_EQ(addmonom_f_x, expected_addmonmon_f_x);
+    EXPECT_EQ(addmonom_f_x, expected_addmonmon_f_x);
 
-  f.sub_monomial_inplace(one); // subtract 1. equivalent to 'f-1'
-  const auto expected_submonom_f_x = addmonom_f_x - one;
-  const auto submonom_f_x = f(x);
+    f.sub_monomial_inplace(one); // subtract 1. equivalent to 'f-1'
+    const auto expected_submonom_f_x = addmonom_f_x - one;
+    const auto submonom_f_x = f(x);
 
-  EXPECT_EQ(submonom_f_x, expected_submonom_f_x);
+    EXPECT_EQ(submonom_f_x, expected_submonom_f_x);
+  }
 }
 
 TEST_F(PolynomialTest, ReadCoeffsToHost)
 {
-  const scalar_t coeffs_f[3] = {zero, one, two}; // x+2x^2
-  auto f = Polynomial_t::from_coefficients(coeffs_f, 3);
-  const scalar_t coeffs_g[3] = {one, one, one}; // 1+x+x^2
-  auto g = Polynomial_t::from_coefficients(coeffs_g, 3);
-
-  auto h = f + g; // 1+2x+3x^3
-  const auto h0 = h.get_coeff(0);
-  const auto h1 = h.get_coeff(1);
-  const auto h2 = h.get_coeff(2);
-  EXPECT_EQ(h0, one);
-  EXPECT_EQ(h1, two);
-  EXPECT_EQ(h2, three);
-
-  scalar_t h_coeffs[3] = {0};
-  auto nof_coeffs = h.copy_coeffs(h_coeffs, 0, 2); // read the coefficients
-  EXPECT_EQ(nof_coeffs, 3);                        // expecting 3 due to specified indices
-
-  scalar_t expected_h_coeffs[3] = {one, two, three};
-  for (int i = 0; i < nof_coeffs; ++i) {
-    EXPECT_EQ(expected_h_coeffs[i], h_coeffs[i]);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const scalar_t coeffs_f[3] = {zero, one, two}; // x+2x^2
+    auto f = Polynomial_t::from_coefficients(coeffs_f, 3);
+    const scalar_t coeffs_g[3] = {one, one, one}; // 1+x+x^2
+    auto g = Polynomial_t::from_coefficients(coeffs_g, 3);
+
+    auto h = f + g; // 1+2x+3x^3
+    const auto h0 = h.get_coeff(0);
+    const auto h1 = h.get_coeff(1);
+    const auto h2 = h.get_coeff(2);
+    EXPECT_EQ(h0, one);
+    EXPECT_EQ(h1, two);
+    EXPECT_EQ(h2, three);
+
+    scalar_t h_coeffs[3] = {0};
+    auto nof_coeffs = h.copy_coeffs(h_coeffs, 0, 2); // read the coefficients
+    EXPECT_EQ(nof_coeffs, 3);                        // expecting 3 due to specified indices
+
+    scalar_t expected_h_coeffs[3] = {one, two, three};
+    for (int i = 0; i < nof_coeffs; ++i) {
+      EXPECT_EQ(expected_h_coeffs[i], h_coeffs[i]);
+    }
   }
 }
 
 TEST_F(PolynomialTest, divisionSimple)
 {
-  const scalar_t coeffs_a[4] = {five, zero, four, three}; // 3x^3+4x^2+5
-  const scalar_t coeffs_b[3] = {minus_one, zero, one};    // x^2-1
-  auto a = Polynomial_t::from_coefficients(coeffs_a, 4);
-  auto b = Polynomial_t::from_coefficients(coeffs_b, 3);
-
-  auto [q, r] = a.divide(b);
-  scalar_t q_coeffs[2] = {0}; // 3x+4
-  scalar_t r_coeffs[2] = {0}; // 3x+9
-  const auto q_nof_coeffs = q.copy_coeffs(q_coeffs, 0, 1);
-  const auto r_nof_coeffs = r.copy_coeffs(r_coeffs, 0, 1);
-
-  ASSERT_EQ(q_nof_coeffs, 2);
-  ASSERT_EQ(r_nof_coeffs, 2);
-  ASSERT_EQ(q_coeffs[0], scalar_t::from(4));
-  ASSERT_EQ(q_coeffs[1], scalar_t::from(3));
-  ASSERT_EQ(r_coeffs[0], scalar_t::from(9));
-  ASSERT_EQ(r_coeffs[1], scalar_t::from(3));
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const scalar_t coeffs_a[4] = {five, zero, four, three}; // 3x^3+4x^2+5
+    const scalar_t coeffs_b[3] = {minus_one, zero, one};    // x^2-1
+    auto a = Polynomial_t::from_coefficients(coeffs_a, 4);
+    auto b = Polynomial_t::from_coefficients(coeffs_b, 3);
+
+    auto [q, r] = a.divide(b);
+    scalar_t q_coeffs[2] = {0}; // 3x+4
+    scalar_t r_coeffs[2] = {0}; // 3x+9
+    const auto q_nof_coeffs = q.copy_coeffs(q_coeffs, 0, 1);
+    const auto r_nof_coeffs = r.copy_coeffs(r_coeffs, 0, 1);
+
+    ASSERT_EQ(q_nof_coeffs, 2);
+    ASSERT_EQ(r_nof_coeffs, 2);
+    ASSERT_EQ(q_coeffs[0], scalar_t::from(4));
+    ASSERT_EQ(q_coeffs[1], scalar_t::from(3));
+    ASSERT_EQ(r_coeffs[0], scalar_t::from(9));
+    ASSERT_EQ(r_coeffs[1], scalar_t::from(3));
+  }
 }
 
 TEST_F(PolynomialTest, divisionLarge)
 {
-  const int size_0 = 1 << 12, size_1 = 1 << 2;
-  auto a = randomize_polynomial(size_0);
-  auto b = randomize_polynomial(size_1);
-
-  START_TIMER(poly_mult_start);
-  auto [q, r] = a.divide(b);
-  END_TIMER(poly_mult_start, "Polynomial division took", MEASURE);
-
-  scalar_t x = scalar_t::rand_host();
-  auto a_x = a(x);
-  auto b_x = b(x);
-  auto q_x = q(x);
-  auto r_x = r(x);
-
-  // a(x) = b(x)*q(x)+r(x)
-  EXPECT_EQ(a_x, b_x * q_x + r_x);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size_0 = 1 << 12, size_1 = 1 << 2;
+    auto a = randomize_polynomial(size_0);
+    auto b = randomize_polynomial(size_1);
+
+    START_TIMER(poly_mult_start);
+    auto [q, r] = a.divide(b);
+    END_TIMER(poly_mult_start, "Polynomial division took", MEASURE);
+
+    scalar_t x = scalar_t::rand_host();
+    auto a_x = a(x);
+    auto b_x = b(x);
+    auto q_x = q(x);
+    auto r_x = r(x);
+
+    // a(x) = b(x)*q(x)+r(x)
+    EXPECT_EQ(a_x, b_x * q_x + r_x);
+  }
 }
 
 TEST_F(PolynomialTest, divideByVanishingPolynomial)
 {
-  const scalar_t coeffs_v[5] = {minus_one, zero, zero, zero, one}; // x^4-1 vanishes on 4th roots of unity
-  auto v = Polynomial_t::from_coefficients(coeffs_v, 5);
-  auto h = randomize_polynomial(1 << 11, false);
-  auto hv = h * v;
-
-  START_TIMER(poly_div_vanishing);
-  auto h_div_by_vanishing = hv.divide_by_vanishing_polynomial(4);
-  END_TIMER(poly_div_vanishing, "Polynomial division by vanishing (fast) took", MEASURE);
-  assert_equal(h_div_by_vanishing, h);
-
-  START_TIMER(poly_div_long);
-  auto [h_div, R] = hv.divide(v);
-  END_TIMER(poly_div_long, "Polynomial division by vanishing (long division) took", MEASURE);
-  assert_equal(h_div, h);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const scalar_t coeffs_v[5] = {minus_one, zero, zero, zero, one}; // x^4-1 vanishes on 4th roots of unity
+    auto v = Polynomial_t::from_coefficients(coeffs_v, 5);
+    auto h = randomize_polynomial(1 << 11, false);
+    auto hv = h * v;
+
+    START_TIMER(poly_div_vanishing);
+    auto h_div_by_vanishing = hv.divide_by_vanishing_polynomial(4);
+    END_TIMER(poly_div_vanishing, "Polynomial division by vanishing (fast) took", MEASURE);
+    assert_equal(h_div_by_vanishing, h);
+
+    START_TIMER(poly_div_long);
+    auto [h_div, R] = hv.divide(v);
+    END_TIMER(poly_div_long, "Polynomial division by vanishing (long division) took", MEASURE);
+    assert_equal(h_div, h);
+  }
 }
 
 TEST_F(PolynomialTest, clone)
 {
-  const int size = 1 << 10;
-  auto f = randomize_polynomial(size);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 1 << 10;
+    auto f = randomize_polynomial(size);
 
-  const auto x = scalar_t::rand_host();
-  const auto f_x = f(x);
+    const auto x = scalar_t::rand_host();
+    const auto f_x = f(x);
 
-  Polynomial_t g;
-  g = f.clone(); // operator=(&&)
-  g += f;
+    Polynomial_t g;
+    g = f.clone(); // operator=(&&)
+    g += f;
 
-  auto h = g.clone(); // move constructor
+    auto h = g.clone(); // move constructor
 
-  ASSERT_EQ(g(x), two * f_x);
-  ASSERT_EQ(h(x), g(x));
+    ASSERT_EQ(g(x), two * f_x);
+    ASSERT_EQ(h(x), g(x));
+  }
 }
 
 TEST_F(PolynomialTest, View)
 {
-  const int size = 1 << 6;
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 1 << 6;
 
-  auto f = randomize_polynomial(size);
-  auto [d_coeff, N] = f.get_coefficients_view();
+    auto f = randomize_polynomial(size);
+    auto [d_coeff, N] = f.get_coefficients_view();
 
-  EXPECT_EQ(d_coeff.isValid(), true);
-  auto g = f + f;
-  // expecting the view to remain valid in that case
-  EXPECT_EQ(d_coeff.isValid(), true);
+    EXPECT_EQ(d_coeff.isValid(), true);
+    auto g = f + f;
+    // expecting the view to remain valid in that case
+    EXPECT_EQ(d_coeff.isValid(), true);
 
-  f += f;
-  // expecting view to be invalidated since f is modified
-  EXPECT_EQ(d_coeff.isValid(), false);
+    f += f;
+    // expecting view to be invalidated since f is modified
+    EXPECT_EQ(d_coeff.isValid(), false);
+  }
 }
 
 TEST_F(PolynomialTest, slicing)
 {
-  auto body = [](int size) {
-    const bool is_odd_size = size % 2 == 1;
-    auto f = randomize_polynomial(size);
-    const auto x = scalar_t::rand_host();
-
-    // computing e(x) and o(x) directly
-    auto expected_even = scalar_t::zero();
-    auto expected_odd = scalar_t::zero();
-    for (int i = size - 1; i >= 0; --i) {
-      if (i % 2 == 0)
-        expected_even = expected_even * x + f.get_coeff(i);
-      else
-        expected_odd = expected_odd * x + f.get_coeff(i);
-    }
-
-    auto e = f.even();
-    auto o = f.odd();
-
-    ASSERT_EQ(o.degree(), size / 2 - 1);
-    ASSERT_EQ(e.degree(), is_odd_size ? size / 2 : size / 2 - 1);
-
-    // compute even and odd polynomials and compute them at x
-    ASSERT_EQ(f.even()(x), expected_even);
-    ASSERT_EQ(f.odd()(x), expected_odd);
-  };
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    auto body = [](int size) {
+      const bool is_odd_size = size % 2 == 1;
+      auto f = randomize_polynomial(size);
+      const auto x = scalar_t::rand_host();
+
+      // computing e(x) and o(x) directly
+      auto expected_even = scalar_t::zero();
+      auto expected_odd = scalar_t::zero();
+      for (int i = size - 1; i >= 0; --i) {
+        if (i % 2 == 0)
+          expected_even = expected_even * x + f.get_coeff(i);
+        else
+          expected_odd = expected_odd * x + f.get_coeff(i);
+      }
+
+      auto e = f.even();
+      auto o = f.odd();
+
+      ASSERT_EQ(o.degree(), size / 2 - 1);
+      ASSERT_EQ(e.degree(), is_odd_size ? size / 2 : size / 2 - 1);
+
+      // compute even and odd polynomials and compute them at x
+      ASSERT_EQ(f.even()(x), expected_even);
+      ASSERT_EQ(f.odd()(x), expected_odd);
+    };
 
-  body(1 << 10);       // test even size
-  body((1 << 10) - 1); // test odd size
+    body(1 << 10);       // test even size
+    body((1 << 10) - 1); // test odd size
+  }
 }
 
 #ifdef MSM
@@ -899,106 +944,124 @@ class Groth16Example
 
 TEST_F(PolynomialTest, QAP)
 {
-  // (1) construct R1CS and QAP for circuit with N inputs
-  Groth16Example QAP(300 /*=N*/);
-
-  // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate values...]
-  auto witness = QAP.random_witness_inputs();
-  QAP.compute_witness(witness);
-
-  // (3) compute L(x),R(x),O(x) using the witness
-  const int nof_cols = QAP.witness_size;
-
-  Polynomial_t Lx = QAP.L_QAP[0].clone();
-  Polynomial_t Rx = QAP.R_QAP[0].clone();
-  Polynomial_t Ox = QAP.O_QAP[0].clone();
-  for (int col = 1; col < nof_cols; ++col) {
-    Lx += witness[col] * QAP.L_QAP[col];
-    Rx += witness[col] * QAP.R_QAP[col];
-    Ox += witness[col] * QAP.O_QAP[col];
-  }
-
-  const int nof_constraints =
-    QAP.nof_inputs - 1; // multiplying N numbers yields N-1 constraints for this circuit construction
-  const int vanishing_poly_deg = ceil_to_power_of_two(nof_constraints);
-  //  (4) sanity check: verify AB=C at the evaluation points
-  {
-    scalar_t w;
-    ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>((int)ceil(log2(nof_constraints)), &w));
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    // (1) construct R1CS and QAP for circuit with N inputs
+    Groth16Example QAP(300 /*=N*/);
+
+    // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate
+    // values...]
+    auto witness = QAP.random_witness_inputs();
+    QAP.compute_witness(witness);
+
+    // (3) compute L(x),R(x),O(x) using the witness
+    const int nof_cols = QAP.witness_size;
+
+    Polynomial_t Lx = QAP.L_QAP[0].clone();
+    Polynomial_t Rx = QAP.R_QAP[0].clone();
+    Polynomial_t Ox = QAP.O_QAP[0].clone();
+    for (int col = 1; col < nof_cols; ++col) {
+      Lx += witness[col] * QAP.L_QAP[col];
+      Rx += witness[col] * QAP.R_QAP[col];
+      Ox += witness[col] * QAP.O_QAP[col];
+    }
 
-    auto x = scalar_t::one();
-    for (int i = 0; i < vanishing_poly_deg; ++i) {
-      ASSERT_EQ(Lx(x) * Rx(x), Ox(x));
-      x = x * w;
+    const int nof_constraints =
+      QAP.nof_inputs - 1; // multiplying N numbers yields N-1 constraints for this circuit construction
+    const int vanishing_poly_deg = ceil_to_power_of_two(nof_constraints);
+    //  (4) sanity check: verify AB=C at the evaluation points
+    {
+      scalar_t w;
+      ICICLE_CHECK(get_root_of_unity_from_domain<scalar_t>((int)ceil(log2(nof_constraints)), &w));
+
+      auto x = scalar_t::one();
+      for (int i = 0; i < vanishing_poly_deg; ++i) {
+        ASSERT_EQ(Lx(x) * Rx(x), Ox(x));
+        x = x * w;
+      }
     }
-  }
 
-  // (5) compute h(x) as '(L(x)R(x)-O(x)) / t(x)'
-  Polynomial_t h = (Lx * Rx - Ox).divide_by_vanishing_polynomial(vanishing_poly_deg);
+    // (5) compute h(x) as '(L(x)R(x)-O(x)) / t(x)'
+    Polynomial_t h = (Lx * Rx - Ox).divide_by_vanishing_polynomial(vanishing_poly_deg);
 
-  // (6) sanity check: vanishing-polynomial divides (LR-O) without remainder
-  {
-    auto [h_long_div, r] = (Lx * Rx - Ox).divide(vanishing_polynomial(vanishing_poly_deg));
-    EXPECT_EQ(r.degree(), -1); // zero polynomial (expecting division without remainder)
-    assert_equal(h, h_long_div);
+    // (6) sanity check: vanishing-polynomial divides (LR-O) without remainder
+    {
+      auto temp = Lx * Rx - Ox;
+      auto v = vanishing_polynomial(vanishing_poly_deg);
+      auto [q, r] = temp.divide(v);
+      auto rand_x = scalar_t::rand_host();
+      EXPECT_EQ(temp(rand_x), q(rand_x) * v(rand_x) + r(rand_x));
+      EXPECT_EQ(r.degree(), -1); // zero polynomial (expecting division without remainder)
+      assert_equal(h, q);
+    }
   }
 }
 
 #ifdef MSM
 TEST_F(PolynomialTest, commitMSM)
 {
-  const int size = 1 << 6;
-  auto f = randomize_polynomial(size);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    const int size = 1 << 6;
+    auto f = randomize_polynomial(size);
 
-  auto [d_coeff, N] = f.get_coefficients_view();
-  auto msm_config = default_msm_config();
-  msm_config.are_scalars_on_device = true;
+    auto [d_coeff, N] = f.get_coefficients_view();
+    auto msm_config = default_msm_config();
+    msm_config.are_scalars_on_device = true;
 
-  auto points = std::make_unique<affine_t[]>(size);
-  projective_t result;
+    auto points = std::make_unique<affine_t[]>(size);
+    projective_t result;
 
-  auto tau = scalar_t::rand_host();
-  projective_t g = projective_t::rand_host();
-  compute_powers_of_tau(g, tau, points.get(), size);
+    auto tau = scalar_t::rand_host();
+    projective_t g = projective_t::rand_host();
+    compute_powers_of_tau(g, tau, points.get(), size);
 
-  EXPECT_EQ(d_coeff.isValid(), true);
-  ICICLE_CHECK(msm(d_coeff.get(), points.get(), size, msm_config, &result));
+    EXPECT_EQ(d_coeff.isValid(), true);
+    ICICLE_CHECK(msm(d_coeff.get(), points.get(), size, msm_config, &result));
 
-  EXPECT_EQ(result, f(tau) * g);
+    EXPECT_EQ(result, f(tau) * g);
 
-  f += f; // this is invalidating the d_coeff integrity-pointer
+    f += f; // this is invalidating the d_coeff integrity-pointer
 
-  EXPECT_EQ(d_coeff.isValid(), false);
+    EXPECT_EQ(d_coeff.isValid(), false);
+  }
 }
 
 TEST_F(PolynomialTest, DummyGroth16)
 {
-  // (1) construct R1CS and QAP for circuit with N inputs
-  Groth16Example<scalar_t, affine_t, projective_t, dummy_g2_t, dummy_g2_t> groth16_example(30 /*=N*/);
-
-  // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate values...]
-  auto witness = groth16_example.random_witness_inputs();
-  groth16_example.compute_witness(witness);
-
-  groth16_example.setup();
-  auto proof = groth16_example.prove(witness);
-  ASSERT_EQ(groth16_example.dummy_verify(proof, witness), true);
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    // (1) construct R1CS and QAP for circuit with N inputs
+    Groth16Example<scalar_t, affine_t, projective_t, dummy_g2_t, dummy_g2_t> groth16_example(30 /*=N*/);
+
+    // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate
+    // values...]
+    auto witness = groth16_example.random_witness_inputs();
+    groth16_example.compute_witness(witness);
+
+    groth16_example.setup();
+    auto proof = groth16_example.prove(witness);
+    ASSERT_EQ(groth16_example.dummy_verify(proof, witness), true);
+  }
 }
 
   #ifdef G2
 TEST_F(PolynomialTest, Groth16)
 {
-  // (1) construct R1CS and QAP for circuit with N inputs
-  Groth16Example<scalar_t, affine_t, projective_t, g2_affine_t, g2_projective_t> groth16_example(30 /*=N*/);
-
-  // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate
-  // values...]
-  auto witness = groth16_example.random_witness_inputs();
-  groth16_example.compute_witness(witness);
-
-  groth16_example.setup();
-  auto proof = groth16_example.prove(witness);
-  // groth16_example.verify(proof); // cannot implement without pairing
+  for (auto device : s_registered_devices) {
+    ICICLE_CHECK(icicle_set_device(device));
+    // (1) construct R1CS and QAP for circuit with N inputs
+    Groth16Example<scalar_t, affine_t, projective_t, g2_affine_t, g2_projective_t> groth16_example(30 /*=N*/);
+
+    // (2) compute witness: randomize inputs and compute other entries [1,out,...N inputs..., ... intermediate
+    // values...]
+    auto witness = groth16_example.random_witness_inputs();
+    groth16_example.compute_witness(witness);
+
+    groth16_example.setup();
+    auto proof = groth16_example.prove(witness);
+    // groth16_example.verify(proof); // cannot implement without pairing
+  }
 }
   #endif // G2