From 94abe4c2cadab1d5a11c4ffd8d6eeb318d7c6446 Mon Sep 17 00:00:00 2001 From: mrava87 Date: Mon, 9 Dec 2024 09:26:00 +0300 Subject: [PATCH] feat: speedup fourierradon with engine=cuda --- .../signalprocessing/_fourierradon2d_cuda.py | 17 +++++++++++-- .../signalprocessing/_fourierradon3d_cuda.py | 24 +++++++++++++++---- pylops/signalprocessing/fourierradon2d.py | 6 +++-- pylops/signalprocessing/fourierradon3d.py | 6 +++-- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/pylops/signalprocessing/_fourierradon2d_cuda.py b/pylops/signalprocessing/_fourierradon2d_cuda.py index 2a0d35ba..f74390ff 100755 --- a/pylops/signalprocessing/_fourierradon2d_cuda.py +++ b/pylops/signalprocessing/_fourierradon2d_cuda.py @@ -1,8 +1,13 @@ from cmath import exp from math import pi +import cupy as cp from numba import cuda +TWO_PI_MINUS = cp.float32(-2.0 * pi) +TWO_PI_PLUS = cp.float32(2.0 * pi) +I = cp.complex64(1j) + @cuda.jit def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh): @@ -16,7 +21,11 @@ def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh): ih, ifr = cuda.grid(2) if ih < nh and ifr >= flim0 and ifr <= flim1: for ipx in range(npx): - y[ih, ifr] += x[ipx, ifr] * exp(-1j * 2 * pi * f[ifr] * px[ipx] * h[ih]) + # slow computation of exp(1j * x) + # y[ih, ifr] += x[ipx, ifr] * exp(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih]) + # fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048 + s, c = cuda.libdevice.sincosf(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih]) + y[ih, ifr] += x[ipx, ifr] * (c + I * s) @cuda.jit @@ -31,7 +40,11 @@ def _aradon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh): ipx, ifr = cuda.grid(2) if ipx < npx and ifr >= flim0 and ifr <= flim1: for ih in range(nh): - x[ipx, ifr] += y[ih, ifr] * exp(1j * 2 * pi * f[ifr] * px[ipx] * h[ih]) + # slow computation of exp(1j * x) + # x[ipx, ifr] += y[ih, ifr] * exp(TWO_PI_I_PLUS * f[ifr] * px[ipx] * h[ih]) + # fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048 + s, c = cuda.libdevice.sincosf(TWO_PI_PLUS * f[ifr] * px[ipx] * h[ih]) + x[ipx, ifr] += y[ih, ifr] * (c + I * s) def _radon_inner_2d_cuda( diff --git a/pylops/signalprocessing/_fourierradon3d_cuda.py b/pylops/signalprocessing/_fourierradon3d_cuda.py index 16f94f7e..2481abe2 100755 --- a/pylops/signalprocessing/_fourierradon3d_cuda.py +++ b/pylops/signalprocessing/_fourierradon3d_cuda.py @@ -1,8 +1,13 @@ from cmath import exp from math import pi +import cupy as cp from numba import cuda +TWO_PI_MINUS = cp.float32(-2.0 * pi) +TWO_PI_PLUS = cp.float32(2.0 * pi) +I = cp.complex64(1j) + @cuda.jit def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy, nhx): @@ -17,9 +22,15 @@ def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy, if ihy < nhy and ihx < nhx and ifr >= flim0 and ifr <= flim1: for ipy in range(npy): for ipx in range(npx): - y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp( - -1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) + # slow computation of exp(1j * x) + # y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp( + # TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) + # ) + # fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048 + s, c = cuda.libdevice.sincosf( + TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) ) + y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * (c + I * s) @cuda.jit @@ -35,9 +46,14 @@ def _aradon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy if ipy < npy and ipx < npx and ifr >= flim0 and ifr <= flim1: for ihy in range(nhy): for ihx in range(nhx): - x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp( - 1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) + # slow computation of exp(1j * x) + # x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp( + # TWO_PI_I_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) + # ) + s, c = cuda.libdevice.sincosf( + TWO_PI_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx]) ) + x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * (c + I * s) def _radon_inner_3d_cuda( diff --git a/pylops/signalprocessing/fourierradon2d.py b/pylops/signalprocessing/fourierradon2d.py index f8296d78..37439dda 100755 --- a/pylops/signalprocessing/fourierradon2d.py +++ b/pylops/signalprocessing/fourierradon2d.py @@ -13,10 +13,12 @@ from pylops.utils.typing import DTypeLike, NDArray jit_message = deps.numba_import("the radon2d module") +cupy_message = deps.cupy_import("the radon2d module") if jit_message is None: - from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda from ._fourierradon2d_numba import _aradon_inner_2d, _radon_inner_2d +if jit_message is None and cupy_message is None: + from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING) @@ -167,7 +169,7 @@ def __init__( self._register_multiplications(engine) def _register_multiplications(self, engine: str) -> None: - if engine == "numba" and jit_message is None: + if engine == "numba": self._matvec = self._matvec_numba self._rmatvec = self._rmatvec_numba elif engine == "cuda": diff --git a/pylops/signalprocessing/fourierradon3d.py b/pylops/signalprocessing/fourierradon3d.py index 75622c85..b6c8a309 100755 --- a/pylops/signalprocessing/fourierradon3d.py +++ b/pylops/signalprocessing/fourierradon3d.py @@ -13,10 +13,12 @@ from pylops.utils.typing import DTypeLike, NDArray jit_message = deps.numba_import("the radon2d module") +cupy_message = deps.cupy_import("the radon2d module") if jit_message is None: - from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda from ._fourierradon3d_numba import _aradon_inner_3d, _radon_inner_3d +if jit_message is None and cupy_message is None: + from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING) @@ -192,7 +194,7 @@ def __init__( self._register_multiplications(engine) def _register_multiplications(self, engine: str) -> None: - if engine == "numba" and jit_message is None: + if engine == "numba": self._matvec = self._matvec_numba self._rmatvec = self._rmatvec_numba elif engine == "cuda":