From 7561726b3314f713fffefe52210582f60a1de44c Mon Sep 17 00:00:00 2001
From: rudderbucky <38639307+rudderbucky@users.noreply.github.com>
Date: Mon, 11 Nov 2024 08:09:52 -0800
Subject: [PATCH 1/5] Update writer.rs

---
 naga/src/back/msl/writer.rs | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
index 83d937d486..19e56a8a6e 100644
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@@ -803,14 +803,21 @@ impl<W: Write> Writer<W> {
         }
 
         self.loop_reachable_macro_name = self.namer.call("LOOP_IS_REACHABLE");
-        let loop_reachable_volatile_name = self.namer.call("unpredictable_jump_over_loop");
-        writeln!(
-            self.out,
-            "#define {} if (volatile bool {} = true; {})",
-            self.loop_reachable_macro_name,
-            loop_reachable_volatile_name,
-            loop_reachable_volatile_name,
-        )?;
+        #[cfg(target_arch = "wasm32")]
+        {
+            let loop_reachable_volatile_name = self.namer.call("unpredictable_jump_over_loop");
+            writeln!(
+                self.out,
+                "#define {} if (volatile bool {} = true; {})",
+                self.loop_reachable_macro_name,
+                loop_reachable_volatile_name,
+                loop_reachable_volatile_name,
+            )?;
+        }
+        #[cfg(not(target_arch = "wasm32"))]
+        {
+            writeln!(self.out, "#define {}", self.loop_reachable_macro_name,)?;
+        }
 
         Ok(())
     }

From 2bea408af76bcc6e4a0dbb38180978763cc0687a Mon Sep 17 00:00:00 2001
From: rudderbucky <38639307+rudderbucky@users.noreply.github.com>
Date: Mon, 11 Nov 2024 18:55:53 -0800
Subject: [PATCH 2/5] Revert "Update writer.rs"

This reverts commit 7561726b3314f713fffefe52210582f60a1de44c.
---
 naga/src/back/msl/writer.rs | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
index 19e56a8a6e..83d937d486 100644
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@@ -803,21 +803,14 @@ impl<W: Write> Writer<W> {
         }
 
         self.loop_reachable_macro_name = self.namer.call("LOOP_IS_REACHABLE");
-        #[cfg(target_arch = "wasm32")]
-        {
-            let loop_reachable_volatile_name = self.namer.call("unpredictable_jump_over_loop");
-            writeln!(
-                self.out,
-                "#define {} if (volatile bool {} = true; {})",
-                self.loop_reachable_macro_name,
-                loop_reachable_volatile_name,
-                loop_reachable_volatile_name,
-            )?;
-        }
-        #[cfg(not(target_arch = "wasm32"))]
-        {
-            writeln!(self.out, "#define {}", self.loop_reachable_macro_name,)?;
-        }
+        let loop_reachable_volatile_name = self.namer.call("unpredictable_jump_over_loop");
+        writeln!(
+            self.out,
+            "#define {} if (volatile bool {} = true; {})",
+            self.loop_reachable_macro_name,
+            loop_reachable_volatile_name,
+            loop_reachable_volatile_name,
+        )?;
 
         Ok(())
     }

From ac60902244f3287a47bba4bf5da1a8233153c40a Mon Sep 17 00:00:00 2001
From: rudderbucky <38639307+rudderbucky@users.noreply.github.com>
Date: Tue, 12 Nov 2024 18:39:14 -0800
Subject: [PATCH 3/5] Only emit loop macro for checked index

---
 naga/src/back/msl/writer.rs | 5 ++++-
 naga/src/proc/index.rs      | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
index 83d937d486..19b0263b30 100644
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@@ -3028,7 +3028,10 @@ impl<W: Write> Writer<W> {
                     ref continuing,
                     break_if,
                 } => {
-                    self.emit_loop_reachable_macro()?;
+                    // We only emit the macro if the index policy is not checked.
+                    if context.expression.policies.index != index::BoundsCheckPolicy::Unchecked {
+                        self.emit_loop_reachable_macro()?;
+                    }
                     if !continuing.is_empty() || break_if.is_some() {
                         let gate_name = self.namer.call("loop_init");
                         writeln!(self.out, "{level}bool {gate_name} = true;")?;
diff --git a/naga/src/proc/index.rs b/naga/src/proc/index.rs
index d0a7f73e2a..828f1f2984 100644
--- a/naga/src/proc/index.rs
+++ b/naga/src/proc/index.rs
@@ -67,6 +67,7 @@ pub enum BoundsCheckPolicy {
 pub struct BoundsCheckPolicies {
     /// How should the generated code handle array, vector, or matrix indices
     /// that are out of range?
+    /// On Metal, this policy also dictates how loops are checked for UB.
     #[cfg_attr(feature = "deserialize", serde(default))]
     pub index: BoundsCheckPolicy,
 

From ae33c5b0e9b1d7d81d8e2f57aaf89ece4dfc0f5a Mon Sep 17 00:00:00 2001
From: rudderbucky <anandkwork7@gmail.com>
Date: Wed, 13 Nov 2024 17:21:18 -0800
Subject: [PATCH 4/5] Update naga/src/proc/index.rs

Co-authored-by: Erich Gubler <erichdongubler@gmail.com>
---
 naga/src/proc/index.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/naga/src/proc/index.rs b/naga/src/proc/index.rs
index 828f1f2984..f6a78db94c 100644
--- a/naga/src/proc/index.rs
+++ b/naga/src/proc/index.rs
@@ -67,6 +67,7 @@ pub enum BoundsCheckPolicy {
 pub struct BoundsCheckPolicies {
     /// How should the generated code handle array, vector, or matrix indices
     /// that are out of range?
+    ///
     /// On Metal, this policy also dictates how loops are checked for UB.
     #[cfg_attr(feature = "deserialize", serde(default))]
     pub index: BoundsCheckPolicy,

From 7bad686bef4806c68dba9bda514fbd68be5ec681 Mon Sep 17 00:00:00 2001
From: rudderbucky <38639307+rudderbucky@users.noreply.github.com>
Date: Wed, 13 Nov 2024 18:15:22 -0800
Subject: [PATCH 5/5] Surface loop checking at PipelineCompilationOptions

---
 deno_webgpu/pipeline.rs                       | 3 +++
 naga/src/back/msl/mod.rs                      | 3 +++
 naga/src/back/msl/writer.rs                   | 8 ++++++--
 naga/src/proc/index.rs                        | 2 --
 wgpu-core/src/device/global.rs                | 3 +++
 wgpu-core/src/device/resource.rs              | 3 +++
 wgpu-core/src/indirect_validation.rs          | 1 +
 wgpu-core/src/pipeline.rs                     | 6 ++++++
 wgpu-hal/examples/halmark/main.rs             | 2 ++
 wgpu-hal/examples/ray-traced-triangle/main.rs | 1 +
 wgpu-hal/src/dynamic/mod.rs                   | 1 +
 wgpu-hal/src/lib.rs                           | 4 ++++
 wgpu-hal/src/metal/device.rs                  | 1 +
 wgpu/src/api/common_pipeline.rs               | 4 ++++
 wgpu/src/backend/wgpu_core.rs                 | 6 ++++++
 15 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/deno_webgpu/pipeline.rs b/deno_webgpu/pipeline.rs
index 0ab3c40262..34ca29c0ea 100644
--- a/deno_webgpu/pipeline.rs
+++ b/deno_webgpu/pipeline.rs
@@ -112,6 +112,7 @@ pub fn op_webgpu_create_compute_pipeline(
             entry_point: compute.entry_point.map(Cow::from),
             constants: Cow::Owned(compute.constants.unwrap_or_default()),
             zero_initialize_workgroup_memory: true,
+            enable_loop_ub_checking: true,
         },
         cache: None,
     };
@@ -344,6 +345,7 @@ pub fn op_webgpu_create_render_pipeline(
                 constants: Cow::Owned(fragment.constants.unwrap_or_default()),
                 // Required to be true for WebGPU
                 zero_initialize_workgroup_memory: true,
+                enable_loop_ub_checking: true,
             },
             targets: Cow::Owned(fragment.targets),
         })
@@ -369,6 +371,7 @@ pub fn op_webgpu_create_render_pipeline(
                 constants: Cow::Owned(args.vertex.constants.unwrap_or_default()),
                 // Required to be true for WebGPU
                 zero_initialize_workgroup_memory: true,
+                enable_loop_ub_checking: true,
             },
             buffers: Cow::Owned(vertex_buffers),
         },
diff --git a/naga/src/back/msl/mod.rs b/naga/src/back/msl/mod.rs
index fbeaa4cc8d..a070b19943 100644
--- a/naga/src/back/msl/mod.rs
+++ b/naga/src/back/msl/mod.rs
@@ -211,6 +211,8 @@ pub struct Options {
     pub bounds_check_policies: index::BoundsCheckPolicies,
     /// Should workgroup variables be zero initialized (by polyfilling)?
     pub zero_initialize_workgroup_memory: bool,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out.
+    pub enable_loop_ub_checking: bool,
 }
 
 impl Default for Options {
@@ -223,6 +225,7 @@ impl Default for Options {
             fake_missing_bindings: true,
             bounds_check_policies: index::BoundsCheckPolicies::default(),
             zero_initialize_workgroup_memory: true,
+            enable_loop_ub_checking: true,
         }
     }
 }
diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
index 19b0263b30..65d20a89a1 100644
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@@ -600,6 +600,9 @@ struct ExpressionContext<'a> {
     /// accesses. These may need to be cached in temporary variables. See
     /// `index::find_checked_indexes` for details.
     guarded_indices: HandleSet<crate::Expression>,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out, which may lead
+    /// to UB on Metal. Loop checking may have significant overhead.
+    pub enable_loop_ub_checking: bool,
 }
 
 impl<'a> ExpressionContext<'a> {
@@ -3028,8 +3031,7 @@ impl<W: Write> Writer<W> {
                     ref continuing,
                     break_if,
                 } => {
-                    // We only emit the macro if the index policy is not checked.
-                    if context.expression.policies.index != index::BoundsCheckPolicy::Unchecked {
+                    if context.expression.enable_loop_ub_checking {
                         self.emit_loop_reachable_macro()?;
                     }
                     if !continuing.is_empty() || break_if.is_some() {
@@ -4868,6 +4870,7 @@ template <typename A>
                     module,
                     mod_info,
                     pipeline_options,
+                    enable_loop_ub_checking: options.enable_loop_ub_checking,
                 },
                 result_struct: None,
             };
@@ -5768,6 +5771,7 @@ template <typename A>
                     module,
                     mod_info,
                     pipeline_options,
+                    enable_loop_ub_checking: options.enable_loop_ub_checking,
                 },
                 result_struct: Some(&stage_out_name),
             };
diff --git a/naga/src/proc/index.rs b/naga/src/proc/index.rs
index f6a78db94c..d0a7f73e2a 100644
--- a/naga/src/proc/index.rs
+++ b/naga/src/proc/index.rs
@@ -67,8 +67,6 @@ pub enum BoundsCheckPolicy {
 pub struct BoundsCheckPolicies {
     /// How should the generated code handle array, vector, or matrix indices
     /// that are out of range?
-    ///
-    /// On Metal, this policy also dictates how loops are checked for UB.
     #[cfg_attr(feature = "deserialize", serde(default))]
     pub index: BoundsCheckPolicy,
 
diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs
index b6ad2354c3..d87bd1bc8d 100644
--- a/wgpu-core/src/device/global.rs
+++ b/wgpu-core/src/device/global.rs
@@ -1266,6 +1266,7 @@ impl Global {
                         .vertex
                         .stage
                         .zero_initialize_workgroup_memory,
+                    enable_loop_ub_checking: desc.vertex.stage.enable_loop_ub_checking,
                 };
                 ResolvedVertexState {
                     stage,
@@ -1294,6 +1295,7 @@ impl Global {
                         .vertex
                         .stage
                         .zero_initialize_workgroup_memory,
+                    enable_loop_ub_checking: desc.vertex.stage.enable_loop_ub_checking,
                 };
                 Some(ResolvedFragmentState {
                     stage,
@@ -1492,6 +1494,7 @@ impl Global {
                 entry_point: desc.stage.entry_point.clone(),
                 constants: desc.stage.constants.clone(),
                 zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
+                enable_loop_ub_checking: desc.stage.enable_loop_ub_checking,
             };
 
             let desc = ResolvedComputePipelineDescriptor {
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index afbf73bc03..ac1ddaf4d3 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -2829,6 +2829,7 @@ impl Device {
                 entry_point: final_entry_point_name.as_ref(),
                 constants: desc.stage.constants.as_ref(),
                 zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
+                enable_loop_ub_checking: desc.stage.enable_loop_ub_checking,
             },
             cache: cache.as_ref().map(|it| it.raw()),
         };
@@ -3250,6 +3251,7 @@ impl Device {
                 entry_point: &vertex_entry_point_name,
                 constants: stage_desc.constants.as_ref(),
                 zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory,
+                enable_loop_ub_checking: stage_desc.enable_loop_ub_checking,
             }
         };
 
@@ -3306,6 +3308,7 @@ impl Device {
                     zero_initialize_workgroup_memory: fragment_state
                         .stage
                         .zero_initialize_workgroup_memory,
+                    enable_loop_ub_checking: fragment_state.stage.enable_loop_ub_checking,
                 })
             }
             None => None,
diff --git a/wgpu-core/src/indirect_validation.rs b/wgpu-core/src/indirect_validation.rs
index 35a95f8bbf..5976ea7f80 100644
--- a/wgpu-core/src/indirect_validation.rs
+++ b/wgpu-core/src/indirect_validation.rs
@@ -204,6 +204,7 @@ impl IndirectValidation {
                 entry_point: "main",
                 constants: &Default::default(),
                 zero_initialize_workgroup_memory: false,
+                enable_loop_ub_checking: true,
             },
             cache: None,
         };
diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs
index 01ceabf669..5d2f3a8434 100644
--- a/wgpu-core/src/pipeline.rs
+++ b/wgpu-core/src/pipeline.rs
@@ -145,6 +145,9 @@ pub struct ProgrammableStageDescriptor<'a> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out, which may lead
+    /// to UB on Metal. Loop checking may have significant overhead.
+    pub enable_loop_ub_checking: bool,
 }
 
 /// Describes a programmable pipeline stage.
@@ -172,6 +175,9 @@ pub struct ResolvedProgrammableStageDescriptor<'a> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out, which may lead
+    /// to UB on Metal. Loop checking may have significant overhead.
+    pub enable_loop_ub_checking: bool,
 }
 
 /// Number of implicit bind groups derived at pipeline creation.
diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs
index 8ab7f1cb47..010c74aa8e 100644
--- a/wgpu-hal/examples/halmark/main.rs
+++ b/wgpu-hal/examples/halmark/main.rs
@@ -259,6 +259,7 @@ impl<A: hal::Api> Example<A> {
                 entry_point: "vs_main",
                 constants: &constants,
                 zero_initialize_workgroup_memory: true,
+                enable_loop_ub_checking: true,
             },
             vertex_buffers: &[],
             fragment_stage: Some(hal::ProgrammableStage {
@@ -266,6 +267,7 @@ impl<A: hal::Api> Example<A> {
                 entry_point: "fs_main",
                 constants: &constants,
                 zero_initialize_workgroup_memory: true,
+                enable_loop_ub_checking: true,
             }),
             primitive: wgt::PrimitiveState {
                 topology: wgt::PrimitiveTopology::TriangleStrip,
diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs
index 4eedfe7817..de4e5a9b41 100644
--- a/wgpu-hal/examples/ray-traced-triangle/main.rs
+++ b/wgpu-hal/examples/ray-traced-triangle/main.rs
@@ -400,6 +400,7 @@ impl<A: hal::Api> Example<A> {
                     entry_point: "main",
                     constants: &Default::default(),
                     zero_initialize_workgroup_memory: true,
+                    enable_loop_ub_checking: true,
                 },
                 cache: None,
             })
diff --git a/wgpu-hal/src/dynamic/mod.rs b/wgpu-hal/src/dynamic/mod.rs
index 5509d7cce6..336ca3de5a 100644
--- a/wgpu-hal/src/dynamic/mod.rs
+++ b/wgpu-hal/src/dynamic/mod.rs
@@ -146,6 +146,7 @@ impl<'a> ProgrammableStage<'a, dyn DynShaderModule> {
             entry_point: self.entry_point,
             constants: self.constants,
             zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory,
+            enable_loop_ub_checking: self.enable_loop_ub_checking,
         }
     }
 }
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index 0cddb69976..1daba70194 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -2138,6 +2138,9 @@ pub struct ProgrammableStage<'a, M: DynShaderModule + ?Sized> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out, which may lead
+    /// to UB on Metal. Loop checking may have significant overhead.
+    pub enable_loop_ub_checking: bool,
 }
 
 impl<M: DynShaderModule + ?Sized> Clone for ProgrammableStage<'_, M> {
@@ -2147,6 +2150,7 @@ impl<M: DynShaderModule + ?Sized> Clone for ProgrammableStage<'_, M> {
             entry_point: self.entry_point,
             constants: self.constants,
             zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory,
+            enable_loop_ub_checking: self.enable_loop_ub_checking,
         }
     }
 }
diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs
index 4cc8ef0eb0..fc67044455 100644
--- a/wgpu-hal/src/metal/device.rs
+++ b/wgpu-hal/src/metal/device.rs
@@ -150,6 +150,7 @@ impl super::Device {
                 binding_array: naga::proc::BoundsCheckPolicy::Unchecked,
             },
             zero_initialize_workgroup_memory: stage.zero_initialize_workgroup_memory,
+            enable_loop_ub_checking: stage.enable_loop_ub_checking,
         };
 
         let pipeline_options = naga::back::msl::PipelineOptions {
diff --git a/wgpu/src/api/common_pipeline.rs b/wgpu/src/api/common_pipeline.rs
index 697507bca2..3187ecb5a4 100644
--- a/wgpu/src/api/common_pipeline.rs
+++ b/wgpu/src/api/common_pipeline.rs
@@ -20,6 +20,9 @@ pub struct PipelineCompilationOptions<'a> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
+    /// Specifies whether shader loops are forcibly prevented from being optimized out, which may lead
+    /// to UB on Metal. Loop checking may have significant overhead.
+    pub enable_loop_ub_checking: bool,
 }
 
 impl<'a> Default for PipelineCompilationOptions<'a> {
@@ -33,6 +36,7 @@ impl<'a> Default for PipelineCompilationOptions<'a> {
         Self {
             constants,
             zero_initialize_workgroup_memory: true,
+            enable_loop_ub_checking: true,
         }
     }
 }
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index befec4bd78..174614f2bf 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -1092,6 +1092,10 @@ impl crate::Context for ContextWgpuCore {
                         .vertex
                         .compilation_options
                         .zero_initialize_workgroup_memory,
+                    enable_loop_ub_checking: desc
+                        .vertex
+                        .compilation_options
+                        .enable_loop_ub_checking,
                 },
                 buffers: Borrowed(&vertex_buffers),
             },
@@ -1106,6 +1110,7 @@ impl crate::Context for ContextWgpuCore {
                     zero_initialize_workgroup_memory: frag
                         .compilation_options
                         .zero_initialize_workgroup_memory,
+                    enable_loop_ub_checking: frag.compilation_options.enable_loop_ub_checking,
                 },
                 targets: Borrowed(frag.targets),
             }),
@@ -1150,6 +1155,7 @@ impl crate::Context for ContextWgpuCore {
                 zero_initialize_workgroup_memory: desc
                     .compilation_options
                     .zero_initialize_workgroup_memory,
+                enable_loop_ub_checking: desc.compilation_options.enable_loop_ub_checking,
             },
             cache: desc.cache.map(downcast_pipeline_cache).copied(),
         };