feat(lss): make lss tasks mpi parallel

radiocosmology · Sep 17, 2024 · e91f97c · e91f97c
1 parent ea72897
commit e91f97c
Show file tree

Hide file tree

Showing 4 changed files with 143 additions and 93 deletions.
diff --git a/cora/core/skysim.py b/cora/core/skysim.py
@@ -3,6 +3,7 @@
 import scipy.integrate as si
 import healpy
 
+from caput import mpiarray
 from cora.util import hputil, nputil
 
 
@@ -92,30 +93,45 @@ def mkfullsky(corr, nside, alms=False, rng=None):
     """
 
     numz = corr.shape[1]
-    maxl = corr.shape[0] - 1
+
+    isdistributed = isinstance(corr, mpiarray.MPIArray)
+
+    if isdistributed:
+        maxl = corr.global_shape[0] - 1
+        corr = corr.local_array
+    else:
+        maxl = corr.shape[0] - 1
 
     if corr.shape[2] != numz:
         raise Exception("Correlation matrix is incorrect shape.")
 
-    alm_array = np.zeros((numz, 1, maxl + 1, maxl + 1), dtype=np.complex128)
+    alm_array = mpiarray.zeros(
+        (numz, 1, maxl + 1, maxl + 1), dtype=np.complex128, axis=2
+    )
 
     # Generate gaussian deviates and transform to have correct correlation
     # structure
-    for l in range(maxl + 1):
+    for lloc, lglob in alm_array.enumerate(axis=2):
         # Add in a small diagonal to try and ensure positive definiteness
-        cmax = corr[l].diagonal().max() * 1e-14
-        corrm = corr[l] + np.identity(numz) * cmax
+        cmax = corr[lloc].diagonal().max() * 1e-14
+        corrm = corr[lloc] + np.identity(numz) * cmax
 
         trans = nputil.matrix_root_manynull(corrm, truncate=False)
-        gaussvars = nputil.complex_std_normal((numz, l + 1), rng=rng)
-        alm_array[:, 0, l, : (l + 1)] = np.dot(trans, gaussvars)
+        gaussvars = nputil.complex_std_normal((numz, lglob + 1), rng=rng)
+        alm_array.local_array[:, 0, lloc, : (lglob + 1)] = np.dot(trans, gaussvars)
 
     if alms:
-        return alm_array
+        # Return the entire alm array on each rank
+        return alm_array.allgather()
 
     # Perform the spherical harmonic transform for each z
-    sky = hputil.sphtrans_inv_sky(alm_array, nside)
-    sky = sky[:, 0]
+    alm_array = alm_array.redistribute(axis=0)
+
+    sky = hputil.sphtrans_inv_sky(alm_array.local_array, nside)[:, 0]
+
+    if isdistributed:
+        # Re-wrap the final array
+        sky = mpiarray.MPIArray.wrap(sky, axis=0)
 
     return sky
 

diff --git a/cora/signal/corrfunc.py b/cora/signal/corrfunc.py
@@ -5,6 +5,8 @@
 import scipy.special as ss
 from scipy.fftpack import dct
 
+from caput import mpiarray
+
 import hankl
 import hankel
 import pyfftlog
@@ -329,9 +331,6 @@ def corr_to_clarray(
     M = q * lmax
     mu, w, wsum = ss.roots_legendre(M, mu=True)
 
-    xlen = xarray.size
-    corr_array = np.zeros((M, xlen, xlen))
-
     # If xromb > 0 we need to integrate over the radial bin width, start by modifying
     # the array of distances to add extra points over which we'll integrate
     #
@@ -359,12 +358,15 @@ def corr_to_clarray(
     else:
         xa = xarray
 
-    # Split the set of theta values to fill into groups of length ~50 and process each,
-    # this is helps reduce memory usage which otherwise we be massively inflated by the
-    # extra points we integrate over
-    for msec in np.array_split(np.arange(M), M // 50):
-
-        rc = coord.cosine_rule(mu[msec], xa, xa)
+    xlen = xarray.size
+    corr_array = mpiarray.zeros((M, xlen, xlen), axis=0)
+    clo = corr_array.local_offset[0]
+    _len = corr_array.local_array.shape[0]
+
+    # Split thetas into ~length 50 chunks, otherwise memory will blow up
+    for msec in np.array_split(np.arange(_len), _len // 50):
+        # Index into the global index in mu
+        rc = coord.cosine_rule(mu[clo + msec], xa, xa)
         corr1 = corr(rc)
 
         # If xromb then we need to integrate over the redshift bins which we do using
@@ -374,15 +376,21 @@ def corr_to_clarray(
             corr1 = np.matmul(corr1, x_w).reshape(-1, xlen, xint, xlen)
             corr1 = np.matmul(corr1.transpose(0, 1, 3, 2), x_w)
 
-        corr_array[msec, :, :] = corr1
+        corr_array.local_array[msec] = corr1
 
+    # Perform the dot product split over ranks for
+    # memory and time
     lm = legendre_array(lmax, mu)
-    lm *= w * 4.0 * np.pi / wsum
+    lm *= w[np.newaxis] * 4.0 * np.pi / wsum
+
+    # Reshape and properly distribute the array to
+    # perform the dot product
+    corr_array = corr_array.reshape(None, -1).redistribute(axis=1)
 
-    clxx = np.dot(lm, corr_array.reshape(M, -1))
-    clxx = clxx.reshape(lmax + 1, xlen, xlen)
+    clxx = np.dot(lm, corr_array.local_array)
+    clxx = mpiarray.MPIArray.wrap(clxx, axis=1).redistribute(axis=0)
 
-    return clxx
+    return clxx.reshape(None, xlen, xlen)
 
 
 def ps_to_aps_flat(