From 2ea42ab06e5a9610d5c8ea71bcf0fa276dc13c4a Mon Sep 17 00:00:00 2001
From: Ricardo Vieira
Date: Fri, 3 Jan 2025 13:47:27 +0100
Subject: [PATCH] Use more specific Numba fastmath flags everywhere

---
 doc/extending/creating_a_numba_jax_op.rst | 8 ++--
 pytensor/link/numba/dispatch/basic.py     | 19 +++++--
 pytensor/link/numba/dispatch/blockwise.py | 1 -
 pytensor/link/numba/dispatch/elemwise.py  | 58 ++--------------------
 pytensor/link/numba/dispatch/extra_ops.py | 9 ++--
 pytensor/link/numba/dispatch/scalar.py    | 24 ++++------
 tests/link/numba/test_scalar.py           | 17 +++++--
 7 files changed, 49 insertions(+), 87 deletions(-)

diff --git a/doc/extending/creating_a_numba_jax_op.rst b/doc/extending/creating_a_numba_jax_op.rst
index 23faea9465..8be08b4953 100644
--- a/doc/extending/creating_a_numba_jax_op.rst
+++ b/doc/extending/creating_a_numba_jax_op.rst
@@ -358,13 +358,13 @@ Here's an example for the `CumOp`\ `Op`:
         if mode == "add":
             if axis is None or ndim == 1:

-                @numba_basic.numba_njit(fastmath=config.numba__fastmath)
+                @numba_basic.numba_njit()
                 def cumop(x):
                     return np.cumsum(x)

             else:

-                @numba_basic.numba_njit(boundscheck=False, fastmath=config.numba__fastmath)
+                @numba_basic.numba_njit(boundscheck=False)
                 def cumop(x):
                     out_dtype = x.dtype
                     if x.shape[axis] < 2:
@@ -382,13 +382,13 @@ Here's an example for the `CumOp`\ `Op`:
         else:
             if axis is None or ndim == 1:

-                @numba_basic.numba_njit(fastmath=config.numba__fastmath)
+                @numba_basic.numba_njit()
                 def cumop(x):
                     return np.cumprod(x)

             else:

-                @numba_basic.numba_njit(boundscheck=False, fastmath=config.numba__fastmath)
+                @numba_basic.numba_njit(boundscheck=False)
                 def cumop(x):
                     out_dtype = x.dtype
                     if x.shape[axis] < 2:
diff --git a/pytensor/link/numba/dispatch/basic.py b/pytensor/link/numba/dispatch/basic.py
index 8bf827b52f..12a5f9d62a 100644
--- a/pytensor/link/numba/dispatch/basic.py
+++ b/pytensor/link/numba/dispatch/basic.py
@@ -49,10 +49,23 @@ def global_numba_func(func):
     return func


-def numba_njit(*args, **kwargs):
+def numba_njit(*args, fastmath=None, **kwargs):
     kwargs.setdefault("cache", config.numba__cache)
     kwargs.setdefault("no_cpython_wrapper", True)
     kwargs.setdefault("no_cfunc_wrapper", True)
+    if fastmath is None:
+        if config.numba__fastmath:
+            # Opinionated default on fastmath flags
+            # https://llvm.org/docs/LangRef.html#fast-math-flags
+            fastmath = {
+                "arcp",  # Allow Reciprocal
+                "contract",  # Allow floating-point contraction
+                "afn",  # Approximate functions
+                "reassoc",
+                "nsz",  # no-signed zeros
+            }
+        else:
+            fastmath = False

     # Suppress cache warning for internal functions
     # We have to add an ansi escape code for optional bold text by numba
@@ -68,9 +81,9 @@
     )

     if len(args) > 0 and callable(args[0]):
-        return numba.njit(*args[1:], **kwargs)(args[0])
+        return numba.njit(*args[1:], fastmath=fastmath, **kwargs)(args[0])

-    return numba.njit(*args, **kwargs)
+    return numba.njit(*args, fastmath=fastmath, **kwargs)


 def numba_vectorize(*args, **kwargs):
diff --git a/pytensor/link/numba/dispatch/blockwise.py b/pytensor/link/numba/dispatch/blockwise.py
index 131788e843..b7481bd5a3 100644
--- a/pytensor/link/numba/dispatch/blockwise.py
+++ b/pytensor/link/numba/dispatch/blockwise.py
@@ -32,7 +32,6 @@ def numba_funcify_Blockwise(op: BlockwiseWithCoreShape, node, **kwargs):
         core_op,
         node=core_node,
         parent_node=node,
-        fastmath=_jit_options["fastmath"],
         **kwargs,
     )
     core_op_fn = store_core_outputs(core_op_fn, nin=nin, nout=nout)
diff --git a/pytensor/link/numba/dispatch/elemwise.py b/pytensor/link/numba/dispatch/elemwise.py
index 2759422bf6..ae5ef3dcb1 100644
--- a/pytensor/link/numba/dispatch/elemwise.py
+++ b/pytensor/link/numba/dispatch/elemwise.py
@@ -1,15 +1,11 @@
-from collections.abc import Callable
 from functools import singledispatch
 from textwrap import dedent, indent
-from typing import Any

 import numba
 import numpy as np
 from numba.core.extending import overload
 from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple

-from pytensor import config
-from pytensor.graph.basic import Apply
 from pytensor.graph.op import Op
 from pytensor.link.numba.dispatch import basic as numba_basic
 from pytensor.link.numba.dispatch.basic import (
@@ -124,42 +120,6 @@ def scalar_in_place_fn_ScalarMinimum(op, idx, res, arr):
     """


-def create_vectorize_func(
-    scalar_op_fn: Callable,
-    node: Apply,
-    use_signature: bool = False,
-    identity: Any | None = None,
-    **kwargs,
-) -> Callable:
-    r"""Create a vectorized Numba function from a `Apply`\s Python function."""
-
-    if len(node.outputs) > 1:
-        raise NotImplementedError(
-            "Multi-output Elemwise Ops are not supported by the Numba backend"
-        )
-
-    if use_signature:
-        signature = [create_numba_signature(node, force_scalar=True)]
-    else:
-        signature = []
-
-    target = (
-        getattr(node.tag, "numba__vectorize_target", None)
-        or config.numba__vectorize_target
-    )
-
-    numba_vectorized_fn = numba_basic.numba_vectorize(
-        signature, identity=identity, target=target, fastmath=config.numba__fastmath
-    )
-
-    py_scalar_func = getattr(scalar_op_fn, "py_func", scalar_op_fn)
-
-    elemwise_fn = numba_vectorized_fn(scalar_op_fn)
-    elemwise_fn.py_scalar_func = py_scalar_func
-
-    return elemwise_fn
-
-
 def create_multiaxis_reducer(
     scalar_op,
     identity,
@@ -320,7 +280,6 @@ def jit_compile_reducer(
     res = numba_basic.numba_njit(
         *args,
         boundscheck=False,
-        fastmath=config.numba__fastmath,
         **kwds,
     )(fn)

@@ -354,7 +313,6 @@ def numba_funcify_Elemwise(op, node, **kwargs):
         op.scalar_op,
         node=scalar_node,
         parent_node=node,
-        fastmath=_jit_options["fastmath"],
         **kwargs,
     )

@@ -442,13 +400,13 @@ def numba_funcify_Sum(op, node, **kwargs):

     if ndim_input == len(axes):
         # Slightly faster than `numba_funcify_CAReduce` for this case
-        @numba_njit(fastmath=config.numba__fastmath)
+        @numba_njit
         def impl_sum(array):
             return np.asarray(array.sum(), dtype=np_acc_dtype).astype(out_dtype)

     elif len(axes) == 0:
         # These cases should be removed by rewrites!
-        @numba_njit(fastmath=config.numba__fastmath)
+        @numba_njit
         def impl_sum(array):
             return np.asarray(array, dtype=out_dtype)

@@ -607,9 +565,7 @@ def numba_funcify_Softmax(op, node, **kwargs):
             add_as, 0.0, (axis,), x_at.ndim, x_dtype, keepdims=True
         )

-        jit_fn = numba_basic.numba_njit(
-            boundscheck=False, fastmath=config.numba__fastmath
-        )
+        jit_fn = numba_basic.numba_njit(boundscheck=False)
         reduce_max = jit_fn(reduce_max_py)
         reduce_sum = jit_fn(reduce_sum_py)
     else:
@@ -641,9 +597,7 @@ def numba_funcify_SoftmaxGrad(op, node, **kwargs):
             add_as, 0.0, (axis,), sm_at.ndim, sm_dtype, keepdims=True
         )

-        jit_fn = numba_basic.numba_njit(
-            boundscheck=False, fastmath=config.numba__fastmath
-        )
+        jit_fn = numba_basic.numba_njit(boundscheck=False)
         reduce_sum = jit_fn(reduce_sum_py)
     else:
         reduce_sum = np.sum
@@ -681,9 +635,7 @@ def numba_funcify_LogSoftmax(op, node, **kwargs):
             add_as, 0.0, (axis,), x_at.ndim, x_dtype, keepdims=True
         )

-        jit_fn = numba_basic.numba_njit(
-            boundscheck=False, fastmath=config.numba__fastmath
-        )
+        jit_fn = numba_basic.numba_njit(boundscheck=False)
         reduce_max = jit_fn(reduce_max_py)
         reduce_sum = jit_fn(reduce_sum_py)
     else:
diff --git a/pytensor/link/numba/dispatch/extra_ops.py b/pytensor/link/numba/dispatch/extra_ops.py
index 3629b0e44c..1f0a33e595 100644
--- a/pytensor/link/numba/dispatch/extra_ops.py
+++ b/pytensor/link/numba/dispatch/extra_ops.py
@@ -4,7 +4,6 @@
 import numba
 import numpy as np

-from pytensor import config
 from pytensor.graph import Apply
 from pytensor.link.numba.dispatch import basic as numba_basic
 from pytensor.link.numba.dispatch.basic import get_numba_type, numba_funcify
@@ -50,13 +49,13 @@ def numba_funcify_CumOp(op: CumOp, node: Apply, **kwargs):
     if mode == "add":
         if axis is None or ndim == 1:

-            @numba_basic.numba_njit(fastmath=config.numba__fastmath)
+            @numba_basic.numba_njit
             def cumop(x):
                 return np.cumsum(x)

         else:

-            @numba_basic.numba_njit(boundscheck=False, fastmath=config.numba__fastmath)
+            @numba_basic.numba_njit(boundscheck=False)
             def cumop(x):
                 out_dtype = x.dtype
                 if x.shape[axis] < 2:
@@ -74,13 +73,13 @@ def cumop(x):
     else:
         if axis is None or ndim == 1:

-            @numba_basic.numba_njit(fastmath=config.numba__fastmath)
+            @numba_basic.numba_njit
             def cumop(x):
                 return np.cumprod(x)

         else:

-            @numba_basic.numba_njit(boundscheck=False, fastmath=config.numba__fastmath)
+            @numba_basic.numba_njit(boundscheck=False)
             def cumop(x):
                 out_dtype = x.dtype
                 if x.shape[axis] < 2:
diff --git a/pytensor/link/numba/dispatch/scalar.py b/pytensor/link/numba/dispatch/scalar.py
index d9342d5694..e9b637b00f 100644
--- a/pytensor/link/numba/dispatch/scalar.py
+++ b/pytensor/link/numba/dispatch/scalar.py
@@ -2,7 +2,6 @@

 import numpy as np

-from pytensor import config
 from pytensor.compile.ops import ViewOp
 from pytensor.graph.basic import Variable
 from pytensor.link.numba.dispatch import basic as numba_basic
@@ -23,7 +22,6 @@
     Clip,
     Composite,
     Identity,
-    IsNan,
     Mul,
     Reciprocal,
     ScalarOp,
@@ -138,8 +136,6 @@ def {scalar_op_fn_name}({', '.join(input_names)}):

     return numba_basic.numba_njit(
         signature,
-        # numba always returns False if fastmath=True # https://github.com/numba/numba/issues/9383
-        fastmath=False if isinstance(op, IsNan) else config.numba__fastmath,
         # Functions that call a function pointer can't be cached
         cache=False,
     )(scalar_op_fn)
@@ -179,9 +175,7 @@ def numba_funcify_Add(op, node, **kwargs):
     signature = create_numba_signature(node, force_scalar=True)
     nary_add_fn = binary_to_nary_func(node.inputs, "add", "+")

-    return numba_basic.numba_njit(signature, fastmath=config.numba__fastmath)(
-        nary_add_fn
-    )
+    return numba_basic.numba_njit(signature)(nary_add_fn)


 @numba_funcify.register(Mul)
@@ -189,9 +183,7 @@ def numba_funcify_Mul(op, node, **kwargs):
     signature = create_numba_signature(node, force_scalar=True)
     nary_add_fn = binary_to_nary_func(node.inputs, "mul", "*")

-    return numba_basic.numba_njit(signature, fastmath=config.numba__fastmath)(
-        nary_add_fn
-    )
+    return numba_basic.numba_njit(signature)(nary_add_fn)


 @numba_funcify.register(Cast)
@@ -241,7 +233,7 @@ def numba_funcify_Composite(op, node, **kwargs):

     _ = kwargs.pop("storage_map", None)

-    composite_fn = numba_basic.numba_njit(signature, fastmath=config.numba__fastmath)(
+    composite_fn = numba_basic.numba_njit(signature)(
         numba_funcify(op.fgraph, squeeze_output=True, **kwargs)
     )
     return composite_fn
@@ -269,7 +261,7 @@ def numba_funcify_Reciprocal(op, node, **kwargs):
     return numba_basic.global_numba_func(reciprocal)


-@numba_basic.numba_njit(fastmath=config.numba__fastmath)
+@numba_basic.numba_njit
 def sigmoid(x):
     return 1 / (1 + np.exp(-x))

@@ -279,7 +271,7 @@ def numba_funcify_Sigmoid(op, node, **kwargs):
     return numba_basic.global_numba_func(sigmoid)


-@numba_basic.numba_njit(fastmath=config.numba__fastmath)
+@numba_basic.numba_njit
 def gammaln(x):
     return math.lgamma(x)

@@ -289,7 +281,7 @@ def numba_funcify_GammaLn(op, node, **kwargs):
     return numba_basic.global_numba_func(gammaln)


-@numba_basic.numba_njit(fastmath=config.numba__fastmath)
+@numba_basic.numba_njit
 def logp1mexp(x):
     if x < np.log(0.5):
         return np.log1p(-np.exp(x))
@@ -302,7 +294,7 @@ def numba_funcify_Log1mexp(op, node, **kwargs):
     return numba_basic.global_numba_func(logp1mexp)


-@numba_basic.numba_njit(fastmath=config.numba__fastmath)
+@numba_basic.numba_njit
 def erf(x):
     return math.erf(x)

@@ -312,7 +304,7 @@ def numba_funcify_Erf(op, **kwargs):
     return numba_basic.global_numba_func(erf)


-@numba_basic.numba_njit(fastmath=config.numba__fastmath)
+@numba_basic.numba_njit
 def erfc(x):
     return math.erfc(x)

diff --git a/tests/link/numba/test_scalar.py b/tests/link/numba/test_scalar.py
index d9a85ee0e3..655e507da6 100644
--- a/tests/link/numba/test_scalar.py
+++ b/tests/link/numba/test_scalar.py
@@ -143,12 +143,19 @@ def test_reciprocal(v, dtype):
     )


-@pytest.mark.parametrize("dtype", ("complex64", "float64", "float32"))
-def test_isnan(dtype):
+@pytest.mark.parametrize("composite", (False, True))
+def test_isnan(composite):
     # Testing with tensor just to make sure Elemwise does not revert the scalar behavior of fastmath
-    x = tensor(shape=(2,), dtype=dtype)
-    out = pt.isnan(x)
+    x = tensor(shape=(2,), dtype="float64")
+
+    if composite:
+        x_scalar = psb.float64()
+        scalar_out = ~psb.isnan(x_scalar)
+        out = Elemwise(Composite([x_scalar], [scalar_out]))(x)
+    else:
+        out = pt.isnan(x)
+
     compare_numba_and_py(
         ([x], [out]),
-        [np.array([1, 0], dtype=dtype)],
+        [np.array([1, 0], dtype="float64")],
     )
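
Note (illustrative, not part of the patch): the snippet below is a minimal
sketch of what the restricted flag set chosen in `numba_njit` means in plain
Numba terms, assuming only that numba and numpy are installed. The names
`FASTMATH_FLAGS` and `rescale` are made up for the example.

    import numba
    import numpy as np

    # Same subset of LLVM fast-math flags that numba_njit now defaults to when
    # config.numba__fastmath is enabled; "nnan" and "ninf" are deliberately absent.
    FASTMATH_FLAGS = {"arcp", "contract", "afn", "reassoc", "nsz"}

    @numba.njit(fastmath=FASTMATH_FLAGS)
    def rescale(x):
        # "arcp" lets x / s be computed as x * (1 / s) and "contract" allows
        # fused multiply-add, but NaNs still propagate because "nnan" is not set.
        return x / x.sum()

    print(rescale(np.array([1.0, 2.0, np.nan])))  # -> [nan nan nan]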
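
The removed IsNan special case and the updated test are related: with full
fastmath=True, LLVM's "nnan" assumption lets isnan be folded to False (the
removed comment points at numba issue #9383), which is why IsNan previously
had to opt out of fastmath entirely. A hypothetical stand-alone reproduction,
again assuming only numba and numpy; the function names are made up:

    import numba
    import numpy as np

    @numba.njit(fastmath=True)
    def isnan_full_fastmath(x):
        # Full fastmath implies "nnan": LLVM may assume x is never NaN
        # and constant-fold this check away.
        return np.isnan(x)

    @numba.njit(fastmath={"arcp", "contract", "afn", "reassoc", "nsz"})
    def isnan_restricted(x):
        # The restricted flag set leaves NaN semantics intact.
        return np.isnan(x)

    print(isnan_full_fastmath(np.nan))  # may print False
    print(isnan_restricted(np.nan))     # True

Because the restricted set keeps isnan correct, the test now also routes isnan
through a Composite/Elemwise graph to check that nothing reintroduces the old
blanket fastmath behaviour.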