Do not mutate weights in reduce.mode! Fixes #145

Apart from mutating the weights (which are eligilable for re-use!), this might also cause issues with concurrent writes, since the regrid uses a parallel for loop via numba.prange. With this commit, we allocate a new (heap) array for every call of mode. For large grids, this is an enormous amount of small allocations, but mode is a relatively slow method anyway...
Deltares · Aug 12, 2023 · ea4828d · ea4828d
1 parent 1ec9789
commit ea4828d
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 13 deletions.
diff --git a/tests/test_regrid/test_reduce.py b/tests/test_regrid/test_reduce.py
@@ -52,6 +52,8 @@ def test_mode(args):
     args = (values, indices, weights)
     actual = reduce.mode(*args)
     assert np.allclose(actual, 1.0)
+    # The weights shouldn't be mutated!
+    assert np.allclose(weights, 0.5)
 
 
 def test_median(args):

diff --git a/xugrid/regrid/reduce.py b/xugrid/regrid/reduce.py
@@ -105,14 +105,10 @@ def maximum(values, indices, weights):
 
 
 def mode(values, indices, weights):
-    # Area weighted mode
-    # Reuse weights to do counting: no allocations
-    # The alternative is defining a separate frequency array in which to add
-    # the weights. This implementation is less efficient in terms of looping.
-    # With many unique values, it keeps having to loop through a big part of
-    # the weights array... but it would do so with a separate frequency array
-    # as well. There are somewhat more elements to traverse in this case.
-    s = indices.size
+    # Area weighted mode. We use a linear search to accumulate weights, as we
+    # generally expect a relatively small number of elements in the indices and
+    # weights arrays.
+    accum = weights.copy()
     w_sum = 0
     for running_total, (i, w) in enumerate(zip(indices, weights)):
         v = values[i]
@@ -121,17 +117,17 @@ def mode(values, indices, weights):
         w_sum += 1
         for j in range(running_total):  # Compare with previously found values
             if values[j] == v:  # matches previous value
-                weights[j] += w  # increase previous weight
+                accum[j] += w  # increase previous weight sum
                 break
 
     if w_sum == 0:  # It skipped everything: only nodata values
         return np.nan
     else:  # Find value with highest frequency
         w_max = 0
-        for i in range(s):
-            w = weights[i]
-            if w > w_max:
-                w_max = w
+        for i in range(accum.size):
+            w_accum = accum[i]
+            if w_accum > w_max:
+                w_max = w_accum
                 v = values[i]
         return v