From 957abcd27c9bfd0201dc38bdae4dd5fd9d59a168 Mon Sep 17 00:00:00 2001 From: Jonathan Olson Date: Fri, 9 Aug 2024 19:00:39 -0600 Subject: [PATCH] Patching a few bugs --- js/webgpu/FaceRasterizer.ts | 9 ++- .../rasterize-two-pass/TiledTwoPassModule.ts | 3 +- .../TwoPassFaceRasterizer.ts | 4 +- .../tests/rasterize-two-pass/TwoPassTiled.ts | 5 +- .../mainTwoPassCoarseWGSL.ts | 6 +- .../rasterize-two-pass/mainTwoPassFineWGSL.ts | 61 ++----------------- .../rasterize-two-pass/mainTwoPassTileWGSL.ts | 7 ++- 7 files changed, 28 insertions(+), 67 deletions(-) diff --git a/js/webgpu/FaceRasterizer.ts b/js/webgpu/FaceRasterizer.ts index 2cf4318..cbb07d6 100644 --- a/js/webgpu/FaceRasterizer.ts +++ b/js/webgpu/FaceRasterizer.ts @@ -65,6 +65,7 @@ export const FACE_RASTERIZER_RUN_DEFAULT_OPTIONS = { export type FaceRasterizerExecutionInfo = { config: TwoPassConfig; + numTiles: number; numBins: number; initialRenderableFaces: TwoPassInitialRenderableFace[]; initialEdges: LinearEdge[]; @@ -154,8 +155,9 @@ export default class FaceRasterizer { // Pick the opposite of the storage format, in case we can't write to it directly, and need to blit it over const potentialBlitFormat = this.deviceContext.preferredStorageFormat === 'bgra8unorm' ? 'rgba8unorm' : 'bgra8unorm'; const blitShader = new BlitShader( this.deviceContext.device, potentialBlitFormat ); - const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => { + const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numTiles: number; numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => { mainModule.execute( context, { + numTiles: data.numTiles, numBins: data.numBins, numInitialRenderableFaces: data.numInitialRenderableFaces } ); @@ -182,6 +184,7 @@ export default class FaceRasterizer { context.setTypedBufferValue( renderProgramInstructionsSlot, input.renderProgramInstructions ); execute( context, { + numTiles: input.numTiles, numBins: input.numBins, numInitialRenderableFaces: input.initialRenderableFaces.length, textureBlit: input.textureBlit @@ -284,7 +287,8 @@ export default class FaceRasterizer { const binWidth = Math.ceil( rasterWidth / binSize ); const binHeight = Math.ceil( rasterHeight / binSize ); - const numBins = 256 * tileWidth * tileHeight; + const numTiles = tileWidth * tileHeight; + const numBins = 256 * numTiles; const initialRenderableFaces: TwoPassInitialRenderableFace[] = []; const initialEdges: LinearEdge[] = []; @@ -386,6 +390,7 @@ export default class FaceRasterizer { initialEdges: initialEdges, renderProgramInstructions: renderProgramInstructions, textureBlit: canOutputToCanvas ? null : [ fineOutputTextureView, canvasTextureView ], + numTiles: numTiles, numBins: numBins } ); } diff --git a/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts b/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts index 400b042..51c66ce 100644 --- a/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts +++ b/js/webgpu/modules/rasterize-two-pass/TiledTwoPassModule.ts @@ -40,6 +40,7 @@ export const TILED_TWO_PASS_MODULE_DEFAULTS = { } as const; export type TiledTwoPassRunSize = { + numTiles: number; numBins: number; numInitialRenderableFaces: number; }; @@ -123,7 +124,7 @@ export default class TiledTwoPassModule extends CompositeModule { initializeAddressesModule.execute( context, 0 ); - tileModule.execute( context, runSize.numInitialRenderableFaces * runSize.numBins ); + tileModule.execute( context, runSize.numInitialRenderableFaces * runSize.numTiles ); twoPassModule.execute( context, { numBins: runSize.numBins, numCoarseRenderableFaces: options.maxCoarseRenderableFaces diff --git a/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts b/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts index 99194fc..d8cd1c4 100644 --- a/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts +++ b/js/webgpu/tests/rasterize-two-pass/TwoPassFaceRasterizer.ts @@ -50,7 +50,9 @@ export const evaluateTwoPassFaceRasterizer = async ( } const filterType = PolygonFilterType.Bilinear; - const filterScale = LOOP ? ( 1 + Math.cos( elapsedTime / 100 ) * 0.5 ) * 30 + 1 : 50; // 25 box, 17 bilinear (comparison) + // const filterScale = LOOP ? randomNumbers[ index % ( randomNumbers.length ) ] : 50; // 25 box, 17 bilinear (comparison) + const filterScale = ( 1 + Math.cos( elapsedTime / 100 ) * 0.5 ) * 30 + 1; // 25 box, 17 bilinear (comparison) + // 50.51805795015657 const clippableFace = testPolygonalFace; diff --git a/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts b/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts index 8d9f614..05baaf2 100644 --- a/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts +++ b/js/webgpu/tests/rasterize-two-pass/TwoPassTiled.ts @@ -77,6 +77,7 @@ export const evaluateTwoPassTiled = async ( const binWidth = Math.ceil( rasterWidth / binSize ); const binHeight = Math.ceil( rasterHeight / binSize ); + const numTiles = tileWidth * tileHeight; const numBins = 256 * tileWidth * tileHeight; const initialRenderableFaces: TwoPassInitialRenderableFace[] = []; @@ -151,8 +152,9 @@ export const evaluateTwoPassTiled = async ( // Pick the opposite of the storage format, in case we can't write to it directly, and need to blit it over const potentialBlitFormat = deviceContext.preferredStorageFormat === 'bgra8unorm' ? 'rgba8unorm' : 'bgra8unorm'; const blitShader = new BlitShader( deviceContext.device, potentialBlitFormat ); - const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => { + const wrapBlitModule = new CompositeModule( [ mainModule ], ( context, data: { numTiles: number; numBins: number; numInitialRenderableFaces: number; textureBlit: [ GPUTextureView, GPUTextureView ] | null } ) => { mainModule.execute( context, { + numTiles: data.numTiles, numBins: data.numBins, numInitialRenderableFaces: data.numInitialRenderableFaces } ); @@ -184,6 +186,7 @@ export const evaluateTwoPassTiled = async ( context.setTypedBufferValue( renderProgramInstructionsSlot, input.renderProgramInstructions ); execute( context, { + numTiles: numTiles, numBins: numBins, numInitialRenderableFaces: input.initialRenderableFaces.length, textureBlit: input.textureBlit diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts index f909b04..4df9853 100644 --- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts +++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassCoarseWGSL.ts @@ -3,6 +3,8 @@ /** * TODO: doc * + * Should be dispatched with one workgroup PER coarse renderable face (one thread per face-X-bin). + * * @author Jonathan Olson */ @@ -51,7 +53,7 @@ const mainTwoPassCoarseWGSL = ( fineEdgesSlot, addressesSlot ], wgsl` - const low_area_multiplier = 1e-4f; + const low_area_multiplier = 0.002f; var coarse_face: ${TwoPassCoarseRenderableFaceWGSL}; var scratch_data: array; @@ -145,7 +147,7 @@ const mainTwoPassCoarseWGSL = ( // TODO: don't use low_area_multiplier with full area! let is_full_area = is_source_full_area || area + low_area_multiplier >= max_area; - let needs_write_face = area > low_area_multiplier; + let needs_write_face = area > low_area_multiplier && ( num_clipped_edges > 0u || clipped_clip_counts[ 0u ] != 0i || clipped_clip_counts[ 1u ] != 0i || clipped_clip_counts[ 2u ] != 0i || clipped_clip_counts[ 3u ] != 0i ); let needs_write_edges = needs_write_face && !is_full_area; let required_edge_count = select( 0u, num_clipped_edges, needs_write_edges ); diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts index 3c21eaa..4a1448c 100644 --- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts +++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassFineWGSL.ts @@ -16,12 +16,14 @@ * a full 16x16 grid of integrals (and colors), which will then be combined into the proper (e.g. 15x15) set of pixels. * Thus the bin size can be 15x15 (if bilinear and filter_scale=1), or 13x13 (if Mitchell-Netravali and filter_scale=1). * + * Should be dispatched with one workgroup PER bin (one thread per grid "pixel") + * * TODO: optimize is_constant * * @author Jonathan Olson */ -import { blend_composeWGSL, bounds_clip_edgeWGSL, BufferBindingType, BufferSlot, decimalS, extend_f32WGSL, f32S, F32Type, gamut_map_linear_displayP3WGSL, gamut_map_linear_sRGBWGSL, linear_displayP3_to_linear_sRGBWGSL, linear_sRGB_to_linear_displayP3WGSL, linear_sRGB_to_oklabWGSL, linear_sRGB_to_sRGBWGSL, LinearEdge, LinearEdgeWGSL, logValueWGSL, oklab_to_linear_sRGBWGSL, premultiplyWGSL, RadialGradientType, RenderInstruction, sRGB_to_linear_sRGBWGSL, StorageTextureBindingType, TextureViewSlot, TwoPassConfig, TwoPassFineRenderableFace, TwoPassFineRenderableFaceWGSL, u32S, U32Type, unpremultiplyWGSL, wgsl, wgslBlueprint, WGSLExpressionU32, WGSLMainModule, WGSLSlot } from '../../../imports.js'; +import { blend_composeWGSL, bounds_clip_edgeWGSL, BufferBindingType, BufferSlot, decimalS, extend_f32WGSL, f32S, gamut_map_linear_displayP3WGSL, gamut_map_linear_sRGBWGSL, linear_displayP3_to_linear_sRGBWGSL, linear_sRGB_to_linear_displayP3WGSL, linear_sRGB_to_oklabWGSL, linear_sRGB_to_sRGBWGSL, LinearEdge, LinearEdgeWGSL, oklab_to_linear_sRGBWGSL, premultiplyWGSL, RadialGradientType, RenderInstruction, sRGB_to_linear_sRGBWGSL, StorageTextureBindingType, TextureViewSlot, TwoPassConfig, TwoPassFineRenderableFace, TwoPassFineRenderableFaceWGSL, u32S, unpremultiplyWGSL, wgsl, wgslBlueprint, WGSLExpressionU32, WGSLMainModule, WGSLSlot } from '../../../imports.js'; import { optionize3 } from '../../../../../phet-core/js/optionize.js'; export type mainTwoPassFineWGSLOptions = { @@ -64,10 +66,6 @@ const mainTwoPassFineWGSL = ( const stackSize = 10; const instructionStackSize = 8; - // const logIndex = Math.floor( Math.random() * 1000 ); - const logIndex = 4794; - console.log( logIndex ); - const getInstructionWGSL = ( index: WGSLExpressionU32 ) => wgsl`render_program_instructions[ ${index} ]`; // TODO: find a way so that this isn't needed(!) @@ -87,7 +85,7 @@ const mainTwoPassFineWGSL = ( ], wgsl` const oops_inifinite_loop_code = vec4f( 0.5f, 0.5f, 0f, 0.5f ); - const low_area_multiplier = 1e-4f; + const low_area_multiplier = 0.002f; var bin_xy: vec2; var workgroup_exit: bool; @@ -134,30 +132,9 @@ const mainTwoPassFineWGSL = ( let pixel_xy = bin_xy * config.bin_size + vec2( local_id.x % 16u, local_id.x / 16u ); - // 21, 13 ish - - ${logValueWGSL( { - value: 'pixel_xy.x', - type: U32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - ${logValueWGSL( { - value: 'pixel_xy.y', - type: U32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - let skip_pixel = pixel_xy.x >= config.raster_width || pixel_xy.y >= config.raster_height; var accumulation = vec4f( 0f, 0f, 0f, 0f ); - - //accumulation = vec4( f32( bin_xy.x ) / 16f, 0f, f32( bin_xy.y ) / 16f, 1f ); // TODO: remove - - ${logValueWGSL( { - value: 'next_address', - type: U32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} var oops_count = 0u; while ( workgroupUniformLoad( &next_address ) != 0xffffffffu ) { @@ -174,12 +151,6 @@ const mainTwoPassFineWGSL = ( workgroupBarrier(); - ${logValueWGSL( { - value: 'select( 0u, 1u, skip_pixel )', - type: U32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - let needs_centroid = ( current_face.bits & 0x10000000u ) != 0u; let needs_face = ( current_face.bits & 0x20000000u ) != 0u; let is_full_area = ( current_face.bits & 0x80000000u ) != 0u; @@ -236,30 +207,6 @@ const mainTwoPassFineWGSL = ( // TODO: stuff integrals + color in workgroup memory, barrier, then have each pixel (subset of threads) sum up } - ${logValueWGSL( { - value: 'accumulation.r', - type: F32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - - ${logValueWGSL( { - value: 'accumulation.g', - type: F32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - - ${logValueWGSL( { - value: 'accumulation.b', - type: F32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - - ${logValueWGSL( { - value: 'accumulation.a', - type: F32Type, - lineToLog: line => line.dataArray.flat()[ logIndex ] - } )} - var will_store_pixel = !skip_pixel; ${options.supportsGridFiltering ? wgsl` diff --git a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts index e35dcb5..566f4b1 100644 --- a/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts +++ b/js/webgpu/wgsl/rasterize-two-pass/mainTwoPassTileWGSL.ts @@ -51,7 +51,7 @@ const mainTwoPassTileWGSL = ( coarseEdgesSlot, addressesSlot ], wgsl` - const low_area_multiplier = 1e-4f; + const low_area_multiplier = 0.002f; var scratch_data: array; var base_indices: vec2u; @@ -108,7 +108,8 @@ const mainTwoPassTileWGSL = ( let p0 = edge.startPoint; let p1 = edge.endPoint; - area += ( p1.x + p0.x ) * ( p1.y - p0.y ); + // Offset by the centroid, so that our bounds computations are more accurate. + area += ( p1.x + p0.x - 2f * bounds_centroid.x ) * ( p1.y - p0.y ); if ( is_edge_clipped_count( p0, p1, min, max ) ) { // TODO: consider NOT writing the clip counts in this (hopefully faster) loop? @@ -132,7 +133,7 @@ const mainTwoPassTileWGSL = ( // TODO: don't use low_area_multiplier with full area! let is_full_area = is_source_full_area || area + low_area_multiplier >= max_area; - let needs_write_face = !skip_tile && area > low_area_multiplier; + let needs_write_face = !skip_tile && area > low_area_multiplier && ( num_clipped_edges > 0u || clipped_clip_counts[ 0u ] != 0i || clipped_clip_counts[ 1u ] != 0i || clipped_clip_counts[ 2u ] != 0i || clipped_clip_counts[ 3u ] != 0i ); let needs_write_edges = needs_write_face && !is_full_area; let required_edge_count = select( 0u, num_clipped_edges, needs_write_edges );