From 957abcd27c9bfd0201dc38bdae4dd5fd9d59a168 Mon Sep 17 00:00:00 2001
From: Jonathan Olson <jonathan.olson@colorado.edu>
Date: Fri, 9 Aug 2024 19:00:39 -0600
Subject: [PATCH] Patching a few bugs

---
 js/webgpu/FaceRasterizer.ts                   |  9 ++-
 .../rasterize-two-pass/TiledTwoPassModule.ts  |  3 +-
 .../TwoPassFaceRasterizer.ts                  |  4 +-
 .../tests/rasterize-two-pass/TwoPassTiled.ts  |  5 +-
 .../mainTwoPassCoarseWGSL.ts                  |  6 +-
 .../rasterize-two-pass/mainTwoPassFineWGSL.ts | 61 ++-----------------
 .../rasterize-two-pass/mainTwoPassTileWGSL.ts |  7 ++-
 7 files changed, 28 insertions(+), 67 deletions(-)
diff --git a/js/webgpu/FaceRasterizer.ts b/js/webgpu/FaceRasterizer.ts
index 2cf4318..cbb07d6 100644
--- a/js/webgpu/FaceRasterizer.ts
+++ b/js/webgpu/FaceRasterizer.ts
@@ -65,6 +65,7 @@ export const FACE_RASTERIZER_RUN_DEFAULT_OPTIONS = {
 
 export type FaceRasterizerExecutionInfo = {
   config: TwoPassConfig;
+  numTiles: number;
   numBins: number;
   initialRenderableFaces: TwoPassInitialRenderableFace[];
   initialEdges: LinearEdge[];
@@ -154,8 +155,9 @@ export default class FaceRasterizer {
     // Pick the opposite of the storage format, in case we can't write to it directly, and need to blit it over
     const potentialBlitFormat = this.deviceContext.preferredStorageFormat === 'bgra8unorm' ? 'rgba8unorm' : 'bgra8unorm';
     const blitShader = new BlitShader( this.deviceContext.device, potentialBlitFormat );
-    const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => {
+    const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numTiles: number; numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => {
       mainModule.execute( context, {
+        numTiles: data.numTiles,
         numBins: data.numBins,
         numInitialRenderableFaces: data.numInitialRenderableFaces
       } );
@@ -182,6 +184,7 @@ export default class FaceRasterizer {
         context.setTypedBufferValue( renderProgramInstructionsSlot, input.renderProgramInstructions );
 
         execute( context, {
+          numTiles: input.numTiles,
           numBins: input.numBins,
           numInitialRenderableFaces: input.initialRenderableFaces.length,
           textureBlit: input.textureBlit
@@ -284,7 +287,8 @@ export default class FaceRasterizer {
     const binWidth = Math.ceil( rasterWidth / binSize );
     const binHeight = Math.ceil( rasterHeight / binSize );
 
-    const numBins = 256 * tileWidth * tileHeight;
+    const numTiles = tileWidth * tileHeight;
+    const numBins = 256 * numTiles;
 
     const initialRenderableFaces: TwoPassInitialRenderableFace[] = [];
     const initialEdges: LinearEdge[] = [];
@@ -386,6 +390,7 @@ export default class FaceRasterizer {
       initialEdges: initialEdges,
       renderProgramInstructions: renderProgramInstructions,
       textureBlit: canOutputToCanvas ? null : [ fineOutputTextureView, canvasTextureView ],
+      numTiles: numTiles,
       numBins: numBins
     } );
   }
diff --git a/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts b/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts
index 400b042..51c66ce 100644
--- a/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts
+++ b/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts
@@ -40,6 +40,7 @@ export const TILED_TWO_PASS_MODULE_DEFAULTS = {
 } as const;
 
 export type TiledTwoPassRunSize = {
+  numTiles: number;
   numBins: number;
   numInitialRenderableFaces: number;
 };
@@ -123,7 +124,7 @@ export default class TiledTwoPassModule extends CompositeModule<TiledTwoPassRunS
       twoPassModule
     ], ( context, runSize: TiledTwoPassRunSize ) => {
       initializeAddressesModule.execute( context, 0 );
-      tileModule.execute( context, runSize.numInitialRenderableFaces * runSize.numBins );
+      tileModule.execute( context, runSize.numInitialRenderableFaces * runSize.numTiles );
       twoPassModule.execute( context, {
         numBins: runSize.numBins,
         numCoarseRenderableFaces: options.maxCoarseRenderableFaces
diff --git a/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts b/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts
index 99194fc..d8cd1c4 100644
--- a/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts
+++ b/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts
@@ -50,7 +50,9 @@ export const evaluateTwoPassFaceRasterizer = async (
     }
 
     const filterType = PolygonFilterType.Bilinear;
-    const filterScale = LOOP ? ( 1 + Math.cos( elapsedTime / 100 ) * 0.5 ) * 30 + 1 : 50; // 25 box, 17 bilinear (comparison)
+    // const filterScale = LOOP ? randomNumbers[ index % ( randomNumbers.length ) ] : 50; // 25 box, 17 bilinear (comparison)
+    const filterScale = ( 1 + Math.cos( elapsedTime / 100 ) * 0.5 ) * 30 + 1; // 25 box, 17 bilinear (comparison)
+    // 50.51805795015657
 
     const clippableFace = testPolygonalFace;
 
diff --git a/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts b/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts
index 8d9f614..05baaf2 100644
--- a/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts
+++ b/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts
@@ -77,6 +77,7 @@ export const evaluateTwoPassTiled = async (
   const binWidth = Math.ceil( rasterWidth / binSize );
   const binHeight = Math.ceil( rasterHeight / binSize );
 
+  const numTiles = tileWidth * tileHeight;
   const numBins = 256 * tileWidth * tileHeight;
 
   const initialRenderableFaces: TwoPassInitialRenderableFace[] = [];
@@ -151,8 +152,9 @@ export const evaluateTwoPassTiled = async (
   // Pick the opposite of the storage format, in case we can't write to it directly, and need to blit it over
   const potentialBlitFormat = deviceContext.preferredStorageFormat === 'bgra8unorm' ? 'rgba8unorm' : 'bgra8unorm';
   const blitShader = new BlitShader( deviceContext.device, potentialBlitFormat );
-  const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => {
+  const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numTiles: number; numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => {
     mainModule.execute( context, {
+      numTiles: data.numTiles,
       numBins: data.numBins,
       numInitialRenderableFaces: data.numInitialRenderableFaces
     } );
@@ -184,6 +186,7 @@ export const evaluateTwoPassTiled = async (
       context.setTypedBufferValue( renderProgramInstructionsSlot, input.renderProgramInstructions );
 
       execute( context, {
+        numTiles: numTiles,
         numBins: numBins,
         numInitialRenderableFaces: input.initialRenderableFaces.length,
         textureBlit: input.textureBlit
diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts
index f909b04..4df9853 100644
--- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts
+++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts
@@ -3,6 +3,8 @@
 /**
  * TODO: doc
  *
+ * Should be dispatched with one workgroup PER coarse renderable face (one thread per face-X-bin).
+ *
  * @author Jonathan Olson <jonathan.olson@colorado.edu>
  */
 
@@ -51,7 +53,7 @@ const mainTwoPassCoarseWGSL = (
     fineEdgesSlot,
     addressesSlot
   ], wgsl`
-    const low_area_multiplier = 1e-4f;
+    const low_area_multiplier = 0.002f;
     
     var<workgroup> coarse_face: ${TwoPassCoarseRenderableFaceWGSL};
     var<workgroup> scratch_data: array<vec2u, 256>;
@@ -145,7 +147,7 @@ const mainTwoPassCoarseWGSL = (
       // TODO: don't use low_area_multiplier with full area!
       let is_full_area = is_source_full_area || area + low_area_multiplier >= max_area;
       
-      let needs_write_face = area > low_area_multiplier;
+      let needs_write_face = area > low_area_multiplier && ( num_clipped_edges > 0u || clipped_clip_counts[ 0u ] != 0i || clipped_clip_counts[ 1u ] != 0i || clipped_clip_counts[ 2u ] != 0i || clipped_clip_counts[ 3u ] != 0i );
       let needs_write_edges = needs_write_face && !is_full_area;
       
       let required_edge_count = select( 0u, num_clipped_edges, needs_write_edges );
diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts
index 3c21eaa..4a1448c 100644
--- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts
+++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts
@@ -16,12 +16,14 @@
  * a full 16x16 grid of integrals (and colors), which will then be combined into the proper (e.g. 15x15) set of pixels.
  * Thus the bin size can be 15x15 (if bilinear and filter_scale=1), or 13x13 (if Mitchell-Netravali and filter_scale=1).
  *
+ * Should be dispatched with one workgroup PER bin (one thread per grid "pixel").
+ *
  * TODO: optimize is_constant
  *
  * @author Jonathan Olson <jonathan.olson@colorado.edu>
  */
 
-import { blend_composeWGSL, bounds_clip_edgeWGSL, BufferBindingType, BufferSlot, decimalS, extend_f32WGSL, f32S, F32Type, gamut_map_linear_displayP3WGSL, gamut_map_linear_sRGBWGSL, linear_displayP3_to_linear_sRGBWGSL, linear_sRGB_to_linear_displayP3WGSL, linear_sRGB_to_oklabWGSL, linear_sRGB_to_sRGBWGSL, LinearEdge, LinearEdgeWGSL, logValueWGSL, oklab_to_linear_sRGBWGSL, premultiplyWGSL, RadialGradientType, RenderInstruction, sRGB_to_linear_sRGBWGSL, StorageTextureBindingType, TextureViewSlot, TwoPassConfig, TwoPassFineRenderableFace, TwoPassFineRenderableFaceWGSL, u32S, U32Type, unpremultiplyWGSL, wgsl, wgslBlueprint, WGSLExpressionU32, WGSLMainModule, WGSLSlot } from '../../../imports.js';
+import { blend_composeWGSL, bounds_clip_edgeWGSL, BufferBindingType, BufferSlot, decimalS, extend_f32WGSL, f32S, gamut_map_linear_displayP3WGSL, gamut_map_linear_sRGBWGSL, linear_displayP3_to_linear_sRGBWGSL, linear_sRGB_to_linear_displayP3WGSL, linear_sRGB_to_oklabWGSL, linear_sRGB_to_sRGBWGSL, LinearEdge, LinearEdgeWGSL, oklab_to_linear_sRGBWGSL, premultiplyWGSL, RadialGradientType, RenderInstruction, sRGB_to_linear_sRGBWGSL, StorageTextureBindingType, TextureViewSlot, TwoPassConfig, TwoPassFineRenderableFace, TwoPassFineRenderableFaceWGSL, u32S, unpremultiplyWGSL, wgsl, wgslBlueprint, WGSLExpressionU32, WGSLMainModule, WGSLSlot } from '../../../imports.js';
 import { optionize3 } from '../../../../../phet-core/js/optionize.js';
 
 export type mainTwoPassFineWGSLOptions = {
@@ -64,10 +66,6 @@ const mainTwoPassFineWGSL = (
   const stackSize = 10;
   const instructionStackSize = 8;
 
-  // const logIndex = Math.floor( Math.random() * 1000 );
-  const logIndex = 4794;
-  console.log( logIndex );
-
   const getInstructionWGSL = ( index: WGSLExpressionU32 ) => wgsl`render_program_instructions[ ${index} ]`;
 
   // TODO: find a way so that this isn't needed(!)
@@ -87,7 +85,7 @@ const mainTwoPassFineWGSL = (
   ], wgsl`
     const oops_inifinite_loop_code = vec4f( 0.5f, 0.5f, 0f, 0.5f );
     
-    const low_area_multiplier = 1e-4f;
+    const low_area_multiplier = 0.002f;
     
     var<workgroup> bin_xy: vec2<u32>;
     var<workgroup> workgroup_exit: bool;
@@ -134,30 +132,9 @@ const mainTwoPassFineWGSL = (
       
       let pixel_xy = bin_xy * config.bin_size + vec2( local_id.x % 16u, local_id.x / 16u );
       
-      // 21, 13 ish
-      
-      ${logValueWGSL( {
-        value: 'pixel_xy.x',
-        type: U32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-      ${logValueWGSL( {
-        value: 'pixel_xy.y',
-        type: U32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-              
       let skip_pixel = pixel_xy.x >= config.raster_width || pixel_xy.y >= config.raster_height;
       
       var accumulation = vec4f( 0f, 0f, 0f, 0f );
-
-      //accumulation = vec4( f32( bin_xy.x ) / 16f, 0f, f32( bin_xy.y ) / 16f, 1f ); // TODO: remove
-      
-      ${logValueWGSL( {
-        value: 'next_address',
-        type: U32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
       
       var oops_count = 0u;
       while ( workgroupUniformLoad( &next_address ) != 0xffffffffu ) {
@@ -174,12 +151,6 @@ const mainTwoPassFineWGSL = (
         
         workgroupBarrier();
         
-        ${logValueWGSL( {
-          value: 'select( 0u, 1u, skip_pixel )',
-          type: U32Type,
-          lineToLog: line => line.dataArray.flat()[ logIndex ]
-        } )}
-        
         let needs_centroid = ( current_face.bits & 0x10000000u ) != 0u;
         let needs_face = ( current_face.bits & 0x20000000u ) != 0u;
         let is_full_area = ( current_face.bits & 0x80000000u ) != 0u;
@@ -236,30 +207,6 @@ const mainTwoPassFineWGSL = (
         // TODO: stuff integrals + color in workgroup memory, barrier, then have each pixel (subset of threads) sum up
       }
       
-      ${logValueWGSL( {
-        value: 'accumulation.r',
-        type: F32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-      
-      ${logValueWGSL( {
-        value: 'accumulation.g',
-        type: F32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-      
-      ${logValueWGSL( {
-        value: 'accumulation.b',
-        type: F32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-      
-      ${logValueWGSL( {
-        value: 'accumulation.a',
-        type: F32Type,
-        lineToLog: line => line.dataArray.flat()[ logIndex ]
-      } )}
-      
       var will_store_pixel = !skip_pixel;
       
       ${options.supportsGridFiltering ? wgsl`
diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts
index e35dcb5..566f4b1 100644
--- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts
+++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts
@@ -51,7 +51,7 @@ const mainTwoPassTileWGSL = (
     coarseEdgesSlot,
     addressesSlot
   ], wgsl`
-    const low_area_multiplier = 1e-4f;
+    const low_area_multiplier = 0.002f;
     
     var<workgroup> scratch_data: array<vec2u, 256>;
     var<workgroup> base_indices: vec2u;
@@ -108,7 +108,8 @@ const mainTwoPassTileWGSL = (
             let p0 = edge.startPoint;
             let p1 = edge.endPoint;
             
-            area += ( p1.x + p0.x ) * ( p1.y - p0.y );
+            // Offset the x terms by the bounds centroid, so that the accumulated area (and our bounds computations with it) are more accurate.
+            area += ( p1.x + p0.x - 2f * bounds_centroid.x ) * ( p1.y - p0.y );
             
             if ( is_edge_clipped_count( p0, p1, min, max ) ) {
               // TODO: consider NOT writing the clip counts in this (hopefully faster) loop?
@@ -132,7 +133,7 @@ const mainTwoPassTileWGSL = (
       // TODO: don't use low_area_multiplier with full area!
       let is_full_area = is_source_full_area || area + low_area_multiplier >= max_area;
       
-      let needs_write_face = !skip_tile && area > low_area_multiplier;
+      let needs_write_face = !skip_tile && area > low_area_multiplier && ( num_clipped_edges > 0u || clipped_clip_counts[ 0u ] != 0i || clipped_clip_counts[ 1u ] != 0i || clipped_clip_counts[ 2u ] != 0i || clipped_clip_counts[ 3u ] != 0i );
       let needs_write_edges = needs_write_face && !is_full_area;
       
       let required_edge_count = select( 0u, num_clipped_edges, needs_write_edges );