Add support for the Metal backend (#48)
This PR adds support for the Metal KernelAbstractions (KA) backend.
luraess authored Oct 5, 2024
1 parent d8a4131 commit e4f9e84
Showing 19 changed files with 513 additions and 333 deletions.
37 changes: 35 additions & 2 deletions .buildkite/run_tests.yml
@@ -18,13 +18,15 @@ steps:
julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backend=CUDA"], coverage=true)'
Pkg.test("Chmy"; test_args=["--backends=CUDA"], coverage=true)'
agents:
queue: "juliagpu"
cuda: "*"
timeout_in_minutes: 120
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

- label: "AMDGPU Julia {{matrix.version}}"
matrix:
@@ -44,7 +46,7 @@ steps:
julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backend=AMDGPU"], coverage=true)'
Pkg.test("Chmy"; test_args=["--backends=AMDGPU"], coverage=true)'
agents:
queue: "juliagpu"
rocm: "*"
@@ -54,5 +56,36 @@ steps:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

# We cannot submit coverage for Metal right now, as this would require a cryptic setup that is not enabled here.
- label: "Metal Julia {{matrix.version}}"
matrix:
setup:
version:
- "1.10"
- "1.11"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
# - JuliaCI/julia-coverage#v1:
# codecov: false
command: |
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
julia -e 'println("+++ :julia: Running tests")
using Pkg
Pkg.test("Chmy"; test_args=["--backends=Metal"], coverage=false)'
agents:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
env:
JULIA_NUM_THREADS: 4

env:
SECRET_CODECOV_TOKEN: "D2H/GglFTcK7SKyfuO/Fy34xrVWHzXbtGTGQXAA3wpEPNAATGhHO/mIm0ILLzhMZSI1LplJBxJ7nV5WVsky0e/01nbSnW5iB0QqFHK8rD+lXUr4ls4zMlyUa0Lvsl/HixFyhwBtFhy8ruwUsqN8AbJNSJSiF9x4jXhzTgIvlO25/HqQObcfJa6qwcw0m9uMa3K26w1xrPhdE7F4mdUUREjB1W8dzfkKF+vZUeMqYFKgit21uQ9QsRjDJl0ExOEw0SC910rtGHtDO0bpIe+D1nEGQsQr8VEN3o0hOCgTJrya8MFitBqkKeVBV/NUImu4UtxlNb7r0ZrjTawiFle2tfg==;U2FsdGVkX1+sdgrm8OBTX9elIdJMwLMpOvXFFtHrG9lj5J8qDBdbjJDva3XMXkbF6I4PCh9G9NW0pEcF9ghb7g=="
5 changes: 4 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Chmy"
uuid = "33a72cf0-4690-46d7-b987-06506c2248b9"
authors = ["Ivan Utkin <[email protected]>, Ludovic Raess <[email protected]>, and contributors"]
version = "0.1.19"
version = "0.1.20"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -13,10 +13,12 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
ChmyAMDGPUExt = "AMDGPU"
ChmyCUDAExt = "CUDA"
ChmyMetalExt = "Metal"

[compat]
AMDGPU = "0.8, 0.9, 1"
@@ -25,4 +27,5 @@ CUDA = "5"
KernelAbstractions = "0.9"
MPI = "0.20"
MacroTools = "0.5"
Metal = "1"
julia = "1.9"
8 changes: 7 additions & 1 deletion docs/src/concepts/architectures.md
@@ -2,7 +2,7 @@

## Backend Selection & Architecture Initialization

Chmy.jl supports CPUs, as well as CUDA and ROC backends for Nvidia and AMD GPUs through a thin wrapper around the [`KernelAbstractions.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl) for users to select desirable backends.
Chmy.jl supports CPUs, as well as CUDA, ROC and Metal backends for Nvidia, AMD and Apple M-series GPUs, through a thin wrapper around [`KernelAbstractions.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl), letting users select the desired backend.

```julia
# Default with CPU
@@ -21,6 +21,12 @@ using AMDGPU
arch = Arch(ROCBackend())
```

```julia
using Metal

arch = Arch(MetalBackend())
```

At the beginning of the program, one specifies the backend and initializes the desired architecture. The initialized `arch` variable is then required explicitly when creating objects such as grids and kernel launchers.
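
The role of `arch` can be sketched as follows (a minimal sketch, assuming the `UniformGrid` and `Launcher` signatures used in the example added by this PR):

```julia
using Chmy, Chmy.Architectures, Chmy.Grids, Chmy.KernelLaunch
using KernelAbstractions

arch   = Arch(CPU())    # initialise the architecture once
# the initialised `arch` is passed explicitly to grid and launcher constructors
grid   = UniformGrid(arch; origin=(-1.0, -1.0), extent=(2.0, 2.0), dims=(32, 32))
launch = Launcher(arch, grid)
```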

## Specifying the device ID and stream priority
3 changes: 3 additions & 0 deletions docs/src/concepts/grids.md
@@ -48,6 +48,9 @@ grid = UniformGrid(arch;
dims=(nx, ny, nz))
```

!!! warning "Metal backend"
    If using the Metal backend, make sure to use `Float32` (`f0`) element types in the `origin` and `extent` tuples when initialising the grid.

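For instance, a 2D grid compatible with the Metal backend could be initialised as follows (a sketch mirroring the 1D `diffusion_1d_mtl.jl` example added in this PR; the 2D call is assumed analogous):

```julia
using Chmy, Chmy.Architectures, Chmy.Grids
using Metal

arch = Arch(MetalBackend())
# `f0` literals keep `origin` and `extent` in Float32, as Metal requires
grid = UniformGrid(arch; origin=(-1f0, -1f0), extent=(2f0, 2f0), dims=(32, 32))
```
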
!!! info "Interactive Grid Visualization"
- [grids_2d.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl): Visualization of a 2D `StructuredGrid`
- [grids_3d.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl): Visualization of a 3D `StructuredGrid`
19 changes: 10 additions & 9 deletions docs/src/examples/overview.md
@@ -1,15 +1,16 @@
# Examples Overview

This page provides an overview of [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) examples. These selected examples demonstrate how [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) can be used to solve various numerical problems using architecture-agnostic kernels both on a single-device and in a distributed way.
This page provides an overview of [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) examples. These selected examples demonstrate how Chmy.jl can be used to solve various numerical problems using architecture-agnostic kernels both on a single-device and in a distributed way.

## Table of Contents

| Example | Description |
| Example | Description |
|:------------|:------------|
| [Diffusion 2D](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl) | Solving the 2D diffusion equation on an uniform grid. |
| [Diffusion 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_mpi.jl) | Solving the 2D diffusion equation on an uniform grid distributedly using MPI. |
| [Single-Device Performance Optimization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_perf.jl) | Revisiting the 2D diffusion problem with focus on performance optimization techniques on a single-device architecture |
| [Stokes 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_2d_inc_ve_T_mpi.jl) | Solving the 2D Stokes equation with thermal coupling on an uniform grid. |
| [Stokes 3D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_3d_inc_ve_T_mpi.jl) | Solving the 3D Stokes equation with thermal coupling on an uniform grid distributedly using MPI. |
| [2D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl) | Visualization of a 2D `StructuredGrid`. |
| [3D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl) | Visualization of a 3D `StructuredGrid` |
| [Diffusion 2D](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl) | Solving the 2D diffusion equation on a uniform grid. |
| [Diffusion 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_mpi.jl) | Solving the 2D diffusion equation on a uniform grid and distributed parallelisation using MPI. |
| [Single-Device Performance Optimisation](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_perf.jl) | Revisiting the 2D diffusion problem with focus on performance optimisation techniques on a single-device architecture. |
| [Stokes 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_2d_inc_ve_T_mpi.jl) | Solving the 2D Stokes equation with thermal coupling on a uniform grid. |
| [Stokes 3D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_3d_inc_ve_T_mpi.jl) | Solving the 3D Stokes equation with thermal coupling on a uniform grid and distributed parallelisation using MPI. |
| [Diffusion 1D with Metal](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_1d_mtl.jl) | Solving the 1D diffusion equation using the Metal backend and single precision (`Float32`) on a uniform grid. |
| [2D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl) | Visualization of a 2D `StructuredGrid`. |
| [3D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl) | Visualization of a 3D `StructuredGrid`. |
6 changes: 5 additions & 1 deletion docs/src/getting_started.md
@@ -47,6 +47,7 @@ using KernelAbstractions # for backend-agnostic kernels
using Printf, CairoMakie # for I/O and plotting
# using CUDA
# using AMDGPU
# using Metal
```

In this introductory tutorial, we will use the CPU backend for simplicity:
@@ -56,7 +57,10 @@ backend = CPU()
arch = Arch(backend)
```

If a different backend is desired, one needs to load the relevant package accordingly. For example, if Nvidia or AMD GPUs are available, one can comment out `using CUDA` or `using AMDGPU` and make sure to use `arch = Arch(CUDABackend())` or `arch = Arch(ROCBackend())`, respectively, when selecting the architecture. For further information about executing on a single-device or multi-device architecture, see the documentation section for [Architectures](./concepts/architectures.md)
If a different backend is desired, one needs to load the relevant package accordingly. For example, if Nvidia or AMD GPUs are available, one can uncomment `using CUDA`, `using AMDGPU` or `using Metal` and make sure to use `arch = Arch(CUDABackend())`, `arch = Arch(ROCBackend())` or `arch = Arch(MetalBackend())`, respectively, when selecting the architecture. For further information about executing on a single-device or multi-device architecture, see the documentation section for [Architectures](./concepts/architectures.md).

!!! warning "Metal backend"
    The Metal backend restricts the floating point precision of computations to `Float32` or lower. In Chmy, this can be achieved by initialising the grid object with `Float32` (`f0`) elements in the `origin` and `extent` tuples.
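
The restriction can be checked programmatically; the sketch below uses `Metal.check_eltype`, the same call the updated `test/common.jl` in this PR relies on:

```julia
using Metal

# Returns without throwing for element types Metal supports
Metal.check_eltype(Float32)
# Metal.check_eltype(Float64)  # unsupported eltype: would throw an error
```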

## Writing & Launch Compute Kernels

3 changes: 2 additions & 1 deletion docs/src/index.md
@@ -31,8 +31,9 @@ Chmy.jl provides a comprehensive framework for handling complex computational tasks

A general list of the features is:

- Backend-agnostic capabilities leveraging [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
- Distributed computing support with [MPI.jl](https://github.com/JuliaParallel/MPI.jl)
- Multi-dimensional, parameterizable discrete and continuous fields on structured grids
- Multi-dimensional, parametrisable discrete and continuous fields on structured grids
- High-level interface for specifying boundary conditions with automatic batching for performance
- Finite difference and interpolation operators on discrete fields
- Extensibility: the package is written in pure Julia, so adding new functions, simplification rules, and model transformations has no barrier
56 changes: 56 additions & 0 deletions examples/diffusion_1d_mtl.jl
@@ -0,0 +1,56 @@
using Chmy, Chmy.Architectures, Chmy.Grids, Chmy.Fields, Chmy.BoundaryConditions, Chmy.GridOperators, Chmy.KernelLaunch
using KernelAbstractions
using Printf
using CairoMakie

using Metal

@kernel inbounds = true function compute_q!(q, C, χ, g::StructuredGrid, O)
I = @index(Global, NTuple)
I = I + O
q.x[I...] = -χ * ∂x(C, g, I...)
end

@kernel inbounds = true function update_C!(C, q, Δt, g::StructuredGrid, O)
I = @index(Global, NTuple)
I = I + O
C[I...] -= Δt * divg(q, g, I...)
end

@views function main(backend=CPU(); nx=(32, ))
arch = Arch(backend)
# geometry
grid = UniformGrid(arch; origin=(-1f0, ), extent=(2f0, ), dims=nx)
launch = Launcher(arch, grid; outer_width=(4, ))
# physics
χ = 1.0f0
# numerics
Δt = minimum(spacing(grid))^2 / χ / ndims(grid) / 2.1f0
nt = 100
# allocate fields
C = Field(backend, grid, Center())
q = VectorField(backend, grid)
# initial conditions
set!(C, rand(Float32, size(C)))
bc!(arch, grid, C => Neumann())
# visualisation
fig = Figure(; size=(400, 320))
ax = Axis(fig[1, 1]; xlabel="x", ylabel="y", title="it = 0")
plt = lines!(ax, centers(grid)..., interior(C) |> Array)
display(fig)
# action
for it in 1:nt
@printf("it = %d/%d \n", it, nt)
launch(arch, grid, compute_q! => (q, C, χ, grid))
launch(arch, grid, update_C! => (C, q, Δt, grid); bc=batch(grid, C => Neumann()))
end
KernelAbstractions.synchronize(backend)
plt[2] = interior(C) |> Array
ax.title = "it = $nt"
display(fig)
return
end

n = 64

main(MetalBackend(); nx=(n, ) .- 2)
2 changes: 1 addition & 1 deletion ext/ChmyAMDGPUExt/ChmyAMDGPUExt.jl
@@ -1,6 +1,6 @@
module ChmyAMDGPUExt

using AMDGPU, KernelAbstractions, Chmy
using AMDGPU, KernelAbstractions

import Chmy.Architectures: heuristic_groupsize, set_device!, get_device, pointertype

19 changes: 19 additions & 0 deletions ext/ChmyMetalExt/ChmyMetalExt.jl
@@ -0,0 +1,19 @@
module ChmyMetalExt

using Metal, KernelAbstractions

import Chmy.Architectures: heuristic_groupsize, set_device!, get_device, pointertype

Base.unsafe_wrap(::MetalBackend, ptr::Metal.MtlPtr, dims) = unsafe_wrap(MtlArray, ptr, dims)

pointertype(::MetalBackend, T::DataType) = Metal.MtlPtr{T}

set_device!(dev::Metal.MTL.MTLDeviceInstance) = Metal.device!(dev)

get_device(::MetalBackend, id::Integer) = Metal.MTL.MTLDevice(id)

heuristic_groupsize(::MetalBackend, ::Val{1}) = (256,)
heuristic_groupsize(::MetalBackend, ::Val{2}) = (32, 8)
heuristic_groupsize(::MetalBackend, ::Val{3}) = (32, 8, 1)

end
2 changes: 2 additions & 0 deletions test/Project.toml
@@ -8,9 +8,11 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[compat]
AMDGPU = "0.8, 0.9, 1"
CUDA = "5"
KernelAbstractions = "0.9"
MPI = "0.20"
Metal = "1"
46 changes: 39 additions & 7 deletions test/common.jl
@@ -3,13 +3,45 @@ using Chmy

using KernelAbstractions

# add KA backends
backends = KernelAbstractions.Backend[CPU()]
compatible(::Backend, ::DataType) = true

if get(ENV, "JULIA_CHMY_BACKEND", "") == "AMDGPU"
using AMDGPU
AMDGPU.functional() && push!(backends, ROCBackend())
elseif get(ENV, "JULIA_CHMY_BACKEND", "") == "CUDA"
# number types to test
TEST_TYPES = [Float32, Float64]

# add backends
TEST_BACKENDS = []

if haskey(ENV, "JULIA_CHMY_BACKEND_CPU")
push!(TEST_BACKENDS, CPU())
end

if haskey(ENV, "JULIA_CHMY_BACKEND_CUDA")
using CUDA
CUDA.functional() && push!(backends, CUDABackend())
if CUDA.functional()
push!(TEST_BACKENDS, CUDABackend())
end
end

if haskey(ENV, "JULIA_CHMY_BACKEND_AMDGPU")
using AMDGPU
if AMDGPU.functional()
push!(TEST_BACKENDS, ROCBackend())
end
end

if haskey(ENV, "JULIA_CHMY_BACKEND_Metal")
using Metal

function compatible(::MetalBackend, T::DataType)
try
Metal.check_eltype(T)
return true
catch
return false
end
end

if Metal.functional()
push!(TEST_BACKENDS, MetalBackend())
end
end

2 comments on commit e4f9e84

@utkinis (Member) commented on e4f9e84 · Oct 5, 2024


@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/116651

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.1.20 -m "<description of version>" e4f9e8489231b4b98e3b37e44838e8cf3d5b4855
git push origin v0.1.20
