index.html

<!DOCTYPE html>
<html lang="en" xmlns:p="http://www.w3.org/1999/html">
<head>
  <meta charset="utf-8"/>
  <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

<!--
  Query parameters:
  ?TODO  - show TODOs
  ?diagrams=hide    - hide diagrams (for faster loading)

  "edit" link: http://localhost/alpenglow/?TODO&diagrams=hide#WGSL
-->

  <title>Alpenglow Plan &amp; Documentation</title>

  <link rel="shortcut icon" type="image/x-icon" href="../scenery/assets/logo-v1.svg">
  <link rel="stylesheet" href="../sherpa/lib/bootstrap-2.2.2.css">
  <link rel="stylesheet" href="../sherpa/lib/bootstrap-responsive-2.2.2.css">

  <script>
    window.MathJax = {
      tex: {
        inlineMath: [ [ '$', '$' ], [ '\\(', '\\)' ] ]
      },
      autoload: {
        cases: [ [], [ 'numcases', 'subnumcases' ] ]
      }
    };
  </script>
  <script id="MathJax-script" async src="../sherpa/mathjax/tex-chtml.js"></script>

  <script>
    window.phet = window.phet || {};

    window.pendingDiagrams = [];
    window.addDiagram = ( id, callback ) => {
      window.pendingDiagrams.push( { id: id, callback: callback } );
    };
  </script>

  <!-- Before loading other things (that might error), create hooks to report errors/loads for continuous testing -->
  <script src="../chipper/js/browser/sim-tests/pageload-connector.js"></script>

  <!-- jQuery and LoDash are dependencies -->
  <script src="../sherpa/lib/jquery-2.1.0.min.js"></script>
  <script src="../sherpa/lib/lodash-4.17.4.min.js"></script>

  <!-- For the styling -->
  <script src="../sherpa/lib/bootstrap-2.2.2.js"></script>

  <link rel="stylesheet" href="../sherpa/lib/codemirror-5.52.2.min.css">
  <link rel="stylesheet" href="../sherpa/lib/codemirror-5.52.2.monokai.min.css">

  <script src="../sherpa/lib/codemirror-5.52.2.min.js"></script>
  <script src="../sherpa/lib/codemirror-5.52.2.javascript.min.js"></script>

  <script src="./doc/citations.js"></script>

  <link rel="stylesheet" href="./doc/doc.css">

  <script src="./doc/todo.js"></script>

  <script>
    // Loads a synchronously-executed asynchronously-downloaded script tag, with optional data-main parameter.
    // See http://www.html5rocks.com/en/tutorials/speed/script-loading/ for more about script loading. It helps to
    // load all of the scripts with this method, so they are treated the same (and placed in the correct execution
    // order).
    const loadURL = ( preloadURL, type = 'text/javascript', async = false ) => {
      const script = document.createElement( 'script' );
      script.type = type;
      script.src = preloadURL;
      script.async = async;
      script.setAttribute( 'crossorigin', 'use-credentials' );
      document.head.appendChild( script );
    };

    window.usePhetLib = true;

    loadURL( '../scenery/doc/extractFunctionJS.js' );
    loadURL( './doc/teapotObj.js' );
    loadURL( './doc/load-citations.js' );
    if ( window.usePhetLib ) {
      loadURL( './doc/lib/phet-lib.min.js' ); // TODO: consider using plain alpenglow (smaller)?
    }
    else {
      loadURL( '../assert/js/assert.js' );
      loadURL( '../sherpa/lib/linebreak-1.1.0.js' );
      loadURL( '../sherpa/lib/flatqueue-1.2.1.js' );
      loadURL( '../sherpa/lib/paper-js-0.12.17.js' );
      loadURL( '../sherpa/lib/he-1.1.1.js' );
      loadURL( '../sherpa/lib/TextEncoderLite-3c9f6f0.js' );
      loadURL( '../sherpa/lib/base64-js-1.2.0.js' );
    }
    loadURL( './doc/index.js', 'module' );
    loadURL( './doc/load-mermaid.js', 'module' );
  </script>
</head>

<body>

<div class="row-fluid">
  <div class="span2"></div>
  <div class="span8">
    <div class="give-max-width">
      <div class="page-header" style="text-align: center;">
        <h1>Alpenglow Plan &amp; Documentation</h1>
      </div>
    </div>
  </div>
  <div class="span2"></div>
</div>

<div class="nav-container hidden-phone">
  <div class="nav-root">
    <ul class="nav nav-list nav-main">
      <li>
        <a href="#overview">Overview</a>
        <ul class="nav nav-list">
          <li><a href="#overview-mainChallenges">Main Challenges</a></li>
          <li><a href="#overview-relatedWork">Related Work</a></li>
          <li><a href="#overview-contributions">Contributions</a></li>
          <li><a href="#overview-impact">Desired Impact</a></li>
        </ul>
      </li>
      <li>
        <a href="#concepts">Concepts</a>
        <ul class="nav nav-list">
          <li><a href="#polygonalFaces">Polygonal Faces</a></li>
          <li><a href="#clipping">Clipping</a></li>
          <li><a href="#booleanOperations">Boolean Operations</a></li>
          <li>
            <a href="#antialiasing">Anti-Aliasing</a>
            <ul class="nav nav-list">
              <li><a href="#antialiasing-integrals">Integrals</a></li>
              <li><a href="#antialiasing-filters">Filters</a></li>
              <li><a href="#antialiasing-examples">Examples</a></li>
            </ul>
          </li>
          <li>
            <a href="#color-and-blending">Color &amp; Blending</a>
          </li>
          <li>
            <a href="#renderProgram">RenderProgram</a>
            <ul class="nav nav-list">
              <li><a href="#RenderColor">RenderColor</a></li>
              <li><a href="#RenderPathBoolean">RenderPathBoolean</a></li>
              <li><a href="#RenderStack">RenderStack</a></li>
              <li><a href="#RenderLinearBlend">RenderLinearBlend</a></li>
              <li><a href="#RenderLinearGradient">RenderLinearGradient</a></li>
              <li><a href="#RenderRadialBlend">RenderRadialBlend</a></li>
              <li><a href="#RenderRadialGradient">RenderRadialGradient</a></li>
              <li><a href="#RenderBlendCompose">RenderBlendCompose</a></li>
              <li><a href="#renderProgram-simplification">Simplification</a></li>
              <li><a href="#renderProgram-execution">Execution</a></li>
            </ul>
          </li>
          <li>
            <a href="#WGSL">WGSL</a>
            <ul class="nav nav-list">
              <li><a href="#WGSL-templating">Templating</a></li>
              <li><a href="#WGSL-types">Types</a></li>
              <li><a href="#WGSL-linking">Linking</a></li>
              <li><a href="#WGSL-recording">Recording</a></li>
              <li><a href="#WGSL-logging">Logging</a></li>
              <li><a href="#WGSL-testing">Testing</a></li>
              <li><a href="#WGSL-algorithms">Algorithms</a></li>
              <li><a href="#WGSL-reduce">Reduce</a></li>
              <li><a href="#WGSL-scan">Scan</a></li>
              <li><a href="#WGSL-radixSort">Radix Sort</a></li>
              <li><a href="#WGSL-mergeSort">Merge Sort</a></li>
              <li><a href="#WGSL-stackMonoid">Stack Monoid</a></li>
              <li><a href="#WGSL-memory">Memory</a></li>
              <li><a href="#WGSL-rationals">Rationals</a></li>
              <li><a href="#WGSL-profiling">Profiling</a></li>
              <li><a href="#WGSL-unitTesting">Unit Testing</a></li>
              <li><a href="#WGSL-knownBugs">Known Bugs</a></li>
              <li><a href="#WGSL-notes">Notes</a></li>
              <li><a href="#WGSL-newLanguage">New Language</a></li>
              <li><a href="#WGSL-overlap">Overlap</a></li>
              <li><a href="#WGSL-documentation">Documentation</a></li>
              <li><a href="#WGSL-deprecated">Deprecated</a></li>
            </ul>
          </li>
          <li><a href="#asyncSimulation">Async Simulation</a></li>
          <li><a href="#depthSort">Depth Sort</a></li>
          <li><a href="#vectorCanvas">Vector Canvas</a></li>
        </ul>
      </li>
      <li>
        <a href="#stages">Stages</a>
        <ul class="nav nav-list">
          <li><a href="#stage-transforms">Transforms</a></li>
          <li><a href="#stage-subdivision">Subdivision</a></li>
          <li><a href="#stage-bounds">Bounds</a></li>
          <li><a href="#stage-tiling">Tiling</a></li>
          <li><a href="#stage-integerTransform">Integer Transform</a></li>
          <li><a href="#stage-hilbertSort">Hilbert Sort</a></li>
          <li><a href="#stage-intersection">Intersection</a></li>
          <li><a href="#stage-split">Split</a></li>
          <li><a href="#stage-edgeSort">Edge Sort</a></li>
          <li><a href="#stage-filterConnect">Filter &amp; Connect</a></li>
          <li><a href="#stage-boundaryTrace">Boundary Trace</a></li>
          <li><a href="#stage-faceHoles">Face Holes</a></li>
          <li><a href="#stage-windingMaps">Winding Maps</a></li>
          <li><a href="#stage-renderProgramSimplification">RenderProgram Simplification</a></li>
          <li><a href="#stage-renderableFaceCreation">RenderableFace Creation</a></li>
          <li><a href="#stage-splitPrograms">Split Programs</a></li>
          <li><a href="#stage-rasterize">Rasterize</a></li>
        </ul>
      </li>
      <li><a href="#references">References</a></li>
    </ul>
  </div>
</div>

<div class="row-fluid">
  <div class="span2"></div>
  <div class="span8">
    <div class="give-max-width">

      <h2 id="overview">Overview</h2>

      <p>
        Alpenglow is an experimental rasterizer that takes a scene description and efficiently produces a corresponding
        high-quality image.
      </p>

      <p>
        The goal is to have:
      </p>

      <ul>
        <li>
          A high-performance WebGPU rasterizer (<span class="text-info">in progress</span>)
        </li>
        <li>
          A software reference implementation (<span class="text-success">implemented</span>)
        </li>
        <li>
          A potential WebGL 2 fallback (<span class="text-error">not started</span>)
        </li>
      </ul>

      <p>
        <strong>
          Please contact the author (Jonathan Olson, jonathan.olson@colorado.edu) for any questions or comments, no matter how small!
        </strong>
      </p>

      <p class="TODO">
        Improve rasterization docs below
      </p>

      <p class="TODO">
        Get the scan-based rasterization also working - we are being VERY LIMITED by coarse face limit, since we can't
        dispatch 65536 or more workgroups (and we have one workgroup per coarse face).
      </p>

      <p class="TODO">
        Consider keeping CAG in CPU, sweep-line or Vatti (Kite) like algorithms for intersection (rather than what we do now).
        We could potentially do CAG "deltas" if not much changes, for scenes with some animated but most is static.
        Or consider stateful CAG on GPU?
      </p>

      <p class="TODO">
        CPU CAG would also enable better "vector canvas" approach. Progressive, where each frame has added edges and removed edges.
      </p>

      <p class="TODO">
        Get non-crashing on phones -
        Prioritize demos lazily (either when they are close to the scrolled viewport) or on other threads (web workers).
      </p>

      <p class="TODO">
        Run rendering in web worker thread?
      </p>

      <p class="TODO">
        Remove duplication of render-program logic (that was confusing) - how to do this with potential workgroup variables?
      </p>

      <p class="TODO">
        Fix logging so we don't have to patch in forwarding of local/global/workgroup IDs through everything.
        Can we set these things as a var&lt;private&gt;? That should solve all of it, but would need a different naming.
        Can this be consistently done? Any easy way? Add a WGSLString that goes at start of main(), has a module dependency
        that adds the var-privates?
      </p>

      <p class="TODO">
        Lazy paints with IntersectionObserver API.
        See <a href="https://macarthur.me/posts/animate-canvas-in-a-worker/">Animate Canvas in a Worker</a> or
        <a href="https://webgpu.github.io/webgpu-samples/?sample=worker">Web Worker WebGPU</a>.
        <code>canvas.transferControlToOffscreen()</code> and messaging it to worker seems to work.
      </p>

      <p class="TODO">
        Add TODO levels here, so we can flag critical/important ones. Also, make them searchable?
      </p>

      <p class="TODO">
        Sub-pixel rendering! Do separate regions for each color channel. Get Loupe for screen inspection.
        Experiment more with text and readability, see how small we can get text to still be readable!
        Animated moving siemens stars would help identify the type of subpixel! Animate a bunch of options, see
        which looks the cleanest without jitter.
      </p>

      <p class="TODO">
        Implement SVG full compatibility?
      </p>

      <p class="TODO">
        Get font changes done with Scenery vello branch, so we can include code improvements and get RenderFromNode working
        better. Also allows showing Vello stuff in demos (will need to see if we can port over the multi branch work).
        Merge multi-work. Remove auto-feature-detect for now, so it doesn't ping shaders on main?
      </p>

      <p class="TODO">
        Improve styling, see <a href="https://getbootstrap.com/2.3.2/scaffolding.html">Old Bootstrap docs</a>.
      </p>

      <p class="TODO">
        Add "(your browser)" on SVG/Canvas demo labels?
      </p>

      <p class="TODO">
        Stream out tiles one-by-one (as CPU is ready for it)? We could CAG one tile at a time, and push out the rasterization.
        Have the CPU and GPU working at the same time! Compatible with web workers even also?
      </p>

      <p class="TODO">
        Consider "analytic extension" of the border region, so that manual "extension" not needed for use with filters.
      </p>

      <p class="TODO">
        Why did phong rendering break? It was working?
      </p>

      <h3 id="overview-mainChallenges">Main Challenges</h3>

      <p>
        We are trying to solve the following problems:
      </p>

      <ol>
        <li>
          Need a high-quality and high-performance way to display vector graphics (in the browser,
          and natively). In particular, we would like to avoid <a href="#overview-conflation">conflation artifacts</a>,
          <a href="#overview-aliasingGamma">anti-aliasing/gamma-blending issues</a>, and
          <a href="#overview-gradients">gradient aliasing issues</a>
          without having to resort to expensive techniques like multisampling. We'd also prefer to
          <a href="#overview-color">support wide gamut display</a>, and have finer-grained control over image
          <a href="#overview-upsampling">upsampling</a> and <a href="#overview-downsampling">downsampling</a>.
        </li>
        <li>
          WebGPU/WGSL is missing a library with reusable generic parallel computation primitives (e.g. reduce/scan/sort),
          and doesn't have the abstractions built-in to easily support this.
        </li>
        <li>
          Improving the processing of vector graphics primitives to efficiently solve the occlusion/overlapping problem.
        </li>
      </ol>

      <h4 id="overview-conflation">Conflation Artifacts</h4>

      <p>
        Many common approaches run into conflation artifacts (from
        <span class="citation" data-citation-id="KilgardBolz2012"></span>, see
        <a href="https://computergraphics.stackexchange.com/questions/1824/what-is-illustrators-vector-rasterization-process">description</a>,
        <a href="https://github.com/linebender/vello/issues/49">discussion</a>, and
        <a href="https://w3.impa.br/~diego/projects/GanEtAl14/sample.html?contour">examples</a>). These are graphical
        glitches that occur when coverage is conflated with opacity, and happens in many common approaches (especially
        with compositing). The source is treating the cases "a red path at 50% opacity fully covers the area" and
        "a fully-opaque red path covers 50% of the area" as equivalent in later stages, and cases where multiple paths
        precisely join along a curve may incorrectly show the background in those pixels.
      </p>

      <div class="row-fluid">
        <div class="span4 example">
          <div id="conflation-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Conflation artifacts (Canvas)
        </div>
        <div class="span4 example">
          <div id="conflation-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Conflation artifacts (SVG)
        </div>
        <div class="span4 example">
          <div id="conflation-default"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          No artifacts (Alpenglow CPU)
        </div>
      </div>

      <p>
        We are able to solve this problem by using a different approach based on polygonal boolean operations, where
        we can get a list of polygonal faces where each input path is either fully contained or fully
        excluded in each face. We can then use high-performance approaches to rasterize each face independently, then
        accumulate while avoiding multisampling and conflation artifacts. Often these faces will be of constant color,
        which can very efficiently be displayed (we skip the per-pixel blending with the completely-occluded background
        contents).
      </p>

      <h4 id="overview-aliasingGamma">Aliasing &amp; Gamma-Incorrect Blending</h4>

      <p>
        Aliasing is the low-frequency pattern that spuriously appears when we are displaying high-frequency content
        in a lower-resolution way. In general, we want to reduce these low-frequency patterns without overly introducing
        blurring. Basic anti-aliasing can help "smooth" things like the jagged edges of lines (e.g. with text), but
        some approaches aren't general enough to help with other types of aliasing.
      </p>

      <p>
        To be correct, blending should be done in the correct color spaces. Many applications choose to blend in
        non-linear color spaces (e.g. sRGB), which can cause blended colors to look incorrect (usually darker).
        For more information, see
        <a href="https://blog.johnnovak.net/2016/09/21/what-every-coder-should-know-about-gamma/">this reference</a>.
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="checkerboard-canvas"><div class="example placeholder" style="width: 128px; height: 64px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="checkerboard-svg"><div class="example placeholder" style="width: 128px; height: 64px;"></div></div>
          SVG
        </div>
        <div class="span2"></div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="checkerboard-box"><div class="example placeholder" style="width: 128px; height: 64px;"></div></div>
          Alpenglow CPU (Box Filter)
        </div>
        <div class="span4 example">
          <div id="checkerboard-bilinear"><div class="example placeholder" style="width: 128px; height: 64px;"></div></div>
          Alpenglow CPU (Bilinear Filter)
        </div>
        <div class="span4 example">
          <div id="checkerboard-mitchell-netravali"><div class="example placeholder" style="width: 128px; height: 64px;"></div></div>
          Alpenglow CPU (Mitchell-Netravali Filter)
        </div>
      </div>

      <p>
        For instance, the above checkerboards (with 3d perspective) have smooth transitions from low to high frequency.
        Note the pattern should blur to a light gray as it fades in the distance. Visible patterns are a sign of aliasing,
        and darkening of the pattern is a sign of gamma-incorrect blending.
      </p>

      <h4 id="overview-gradients">Gradient Aliasing &amp; Precision</h4>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="gradients-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="gradients-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          SVG
        </div>
        <div class="span2"></div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="gradients-box"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Box Filter)
        </div>
        <div class="span4 example">
          <div id="gradients-bilinear"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Bilinear Filter)
        </div>
        <div class="span4 example">
          <div id="gradients-mitchell-netravali"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Mitchell-Netravali Filter)
        </div>
      </div>

      <p>
        Gradients with color stops close together (e.g. when zoomed out in a user interface) can also create many aliasing patterns.
        Alpenglow is able to compute exact analytic gradient contributions (based on the exact pixel coverage), and with
        filtering added is able to produce high-quality gradients.
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="gradientPrecision-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="gradientPrecision-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          SVG
        </div>
        <div class="span4 example">
          <div id="gradientPrecision-box"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Box Filter)
        </div>
      </div>

      <p>
        Many gradient implementations also have issues with gradients when zoomed in, and may drop or distort gradients
        in the above cases. Gradient bands should get thinner and thinner until they look like lines.
      </p>

      <h4 id="overview-color">Color Spaces and Wide-Gamut Support</h4>

      <p>
        While gamma is critical, blending in the correct color space is also important. Blending in sRGB does not produce
        perceptually uniform colors. Colors should be converted to/from a color space meant for this purpose when
        perceptual gradients are required. <a href="https://bottosson.github.io/posts/oklab/">Oklab</a> is great for this,
        and while not yet supported in Canvas/SVG, it can be
        <a href="https://developer.mozilla.org/en-US/docs/Web/CSS/color_value/oklab">used in CSS</a>.
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span6 example">
          <div id="redGreenGradients-example"><div class="example placeholder" style="width: 256px; height: 96px;"></div></div>
          Oklab, linear sRGB, and sRGB (top to bottom). Note the green bias in linear sRGB and darkening in sRGB. Oklab is perceptually uniform.
        </div>
        <div class="span6 example">
          <div id="blueWhiteGradients-example"><div class="example placeholder" style="width: 256px; height: 96px;"></div></div>
          Oklab, linear sRGB, and sRGB (top to bottom). Note the hue shift toward purple in the blue-white gradient for sRGB varieties. Oklab is perceptually uniform, without a hue shift.
        </div>
      </div>

      <p>
        However, when we use other color spaces like Oklab, even blends from colors inside the sRGB gamut can produce
        out-of-gamut colors. To compensate for this, we will need to use gamut mapping:
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="sRGBGamutMap-example"><div class="example placeholder" style="width: 206px; height: 180px;"></div></div>
          sRGB triangle with (relative colorimetric) gamut mapping
        </div>
        <div class="span4 example">
          <div id="sRGBNoGamutMap-example"><div class="example placeholder" style="width: 206px; height: 180px;"></div></div>
          sRGB triangle without gamut mapping (gray to white shows how far outside of gamut the colors are)
        </div>
        <div class="span2"></div>
      </div>

      <p>
        Additionally, many monitors are now capable of displaying a wider gamut of colors than sRGB. Chrome and Safari
        support the extended colors inside the <a href="https://en.wikipedia.org/wiki/DCI-P3">Display P3</a> color space.
        As seen, this can be helpful even if most elements are in sRGB, however it can also be impactful to use the
        bolder colors in the extended gamut. Many GPU rasterization solutions do not yet support this.
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;" id="display-p3-container">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="displayP3GamutMap-example"><div class="example placeholder" style="width: 206px; height: 180px;"></div></div>
          Display P3 triangle
        </div>
        <div class="span4 example">
          <div id="displayP3Comparison-example"><div class="example placeholder" style="width: 206px; height: 180px;"></div></div>
          sRGB (top), Display P3 (middle) and Display P3 gamut mapped to sRGB (bottom).
        </div>
        <div class="span2"></div>
      </div>
      <script>
        if ( !window.matchMedia( '(color-gamut: p3)' ).matches ) {
          document.querySelector( '#display-p3-container' ).style.display = 'none';
        }
      </script>

      <p>
        Hopefully soon, we will have a convenient interface to show high dynamic range (HDR) content, but Alpenglow will be able
        to support it when it is available.
      </p>

      <h4 id="overview-upsampling">Image Upsampling</h4>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="upsampling-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="upsampling-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          SVG
        </div>
        <div class="span2"></div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="upsampling-analytic-box"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU Analytic Box Filter
        </div>
        <div class="span4 example">
          <div id="upsampling-mitchell-netravali"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU Mitchell-Netravali Filter
        </div>
        <div class="span2"></div>
      </div>

      <p>
        Most Canvas/SVG implementations do not support upsampling with the Mitchell-Netravali ("cubic") filter, and will
        blend in a gamma-incorrect way. Additionally, we are able to represent images with exact analytic filters, so
        that a box filter upsampling will have nicely anti-aliased edges.
      </p>

      <h4 id="overview-downsampling">Image Downsampling</h4>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="downsampling-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="downsampling-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          SVG
        </div>
        <div class="span2"></div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="downsampling-analytic-box"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Analytic Box Filter)
        </div>
        <div class="span4 example">
          <div id="downsampling-analytic-bilinear"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Analytic Bilinear Filter)
        </div>
        <div class="span4 example">
          <div id="downsampling-analytic-mitchell-netravali"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Alpenglow CPU (Analytic Mitchell-Netravali Filter)
        </div>
      </div>

      <p>
        Image downsampling is also prone to both aliasing arifacts and gamma-incorrect blending. Many Canvas implementations
        will quickly sample an image, leading to serious ringing patterns on these example images. There are many cases
        where we will want to use a higher-quality downsampling algorithm, and we can compute an exact analytic filtered
        result.
      </p>

      <p>
        It is recommended to see the <a href="./doc/downscale.png">image</a> at full (1:1) size, to see how the above methods
        scale the image down.
      </p>

      <h3 id="overview-relatedWork">Related Work</h3>

      <h4>GPU Vector Graphics</h4>

      <p>
        <a href="https://github.com/linebender/vello">Vello</a> is the most similar project, and we've tested integration
        of it <a href="https://github.com/phetsims/scenery/tree/vello">into Scenery</a>. It's the main inspiration for
        this project, and we plan to use many components from it. However, it was tricky getting it to work on the web
        (compiling it for WASM creates a bundle in excess of 5MB), and thus required rewriting the Rust code into TypeScript.
        At the time it didn't have support for avoiding conflation artifacts, and now (as of writing) it is using a more
        expensive multisampling approach to reduce the effects of conflation. Additionally, it runs into gamma blending
        problems, and doesn't support additional color spaces currently. The approach we are using is different enough
        that it unfortunately can't be integrated into Vello.
      </p>

      <p>
        Other libraries that target the GPU for similar rendering include
        <a href="https://github.com/servo/pathfinder">Pathfinder</a> (discontinued),
        <a href="https://github.com/google/forma">Forma</a> (not under active development),
        <a href="https://fuchsia.googlesource.com/fuchsia/+/refs/heads/master/src/graphics/lib/compute/spinel/">Spinel</a>,
        and <a href="https://github.com/flutter/engine/tree/main/impeller">Flutter's Impeller</a>
      </p>

      <p>
        Additionally, browsers have some built-in support for GPU-accelerated vector graphics, most notably with
        <a href="https://developer.mozilla.org/en-US/docs/Web/API/Canvas_API">Canvas</a> and
        <a href="https://developer.mozilla.org/en-US/docs/Web/SVG">SVG</a>. These approaches have decent performance,
        but performance can be improved with the WebGPU/WebGL solutions above. Additionally, they run into the
        conflation/gamma/aliasing artifacts previously described (in order to improve their performance).
      </p>

      <h4>Anti-Aliased Vector Graphics</h4>

      <p>
        For filtering-based analytic rasterization, we've explored other recent approaches, including
        <span class="citation" data-citation-id="doi:10.1080/2151237X.2005.10129191"></span>,
        <span class="citation" data-citation-id="doi:10.1080/2151237X.2005.10129189"></span>,
        <span class="citation" data-citation-id="10.1111:cgf.12070"></span>, which could also be included/implemented.
      </p>

      <p>
        For the general GPU processing, relevant literature is in
        <span class="citation" data-citation-id="10.1145/1409060.1409088"></span>,
        <span class="citation" data-citation-id="10.1145/2018323.2018337"></span>, and
        <span class="citation" data-citation-id="10.1145/2661229.2661274"></span> are particularly promising, as the
        inspiring approaches behind Vello.
        <span class="citation" data-citation-id="GPUpathtracingSA16"></span> could also be explored.
      </p>

      <p>
        Especially for text, many approaches look to be patent/license encumbered, most notably the Loop-Blinn approach
        <span class="citation" data-citation-id="loop2005resolution"></span> (with <a href="https://developer.nvidia.com/gpugems/gpugems3/part-iv-image-effects/chapter-25-rendering-vector-art-gpu">notes</a>).
        The <a href="https://sluglibrary.com/">Slug</a> library seems like the go-to solution for games, backed by
        <span class="citation" data-citation-id="Lengyel2017FontRendering"></span>.
        Signed distance fields like in
        <span class="citation" data-citation-id="10.1145/1281500.1281665"></span> and
        <span class="citation" data-citation-id="MCDF"></span> lose too much accuracy for our purposes,
        however there are examples of <a href="https://www.shadertoy.com/view/4sKyzW">exact Cubic beziers</a> with SDFs
        which can <a href="https://www.shadertoy.com/view/4sySDK">render text</a>. There are also a few approaches using
        circular window approaches:
        <a href="https://wdobbie.com/post/gpu-text-rendering-with-vector-textures/">A</a> and
        <a href="https://gasiulis.name/vector-graphics-on-gpu/">B</a>.
      </p>

      <h4>WebGPU</h4>

      <p>
        High quality libraries for the CUDA interface to GPUs exist, and this work is inspired by
        <a href="https://github.com/NVIDIA/cccl">Thrust</a> <span class="citation" data-citation-id="BELL2012359"></span>
        and <a href="https://github.com/moderngpu/moderngpu/wiki">moderngpu</a>
        <span class="citation" data-citation-id="Baxter:2016:M2"></span>. They serve as prime examples of useful patterns,
        but can't target WebGPU. In addition, many algorithms rely in forward progress or workgroup communication that
        WebGPU doesn't offer (in an effort for portability), so we will need to explore new approaches.
      </p>

      <p>
        There are also good shader generation systems, but which don't target WebGPU. Notably
        <a href="https://github.com/shader-slang/slang">Slang</a>
        <span class="citation" data-citation-id="10.1145/3197517.3201380"></span>
        <span class="citation" data-citation-id="bangaru2023slangd"></span>.
      </p>

      <p>
        <a href="https://usegpu.live/">Use.GPU</a> takes a novel approach with WebGPU, but focuses on rendering instead of
        the compute primitives.
      </p>

      <h3 id="overview-contributions">Contributions</h3>

      <p>
        Current contributions:
      </p>

      <ul>
        <li>
          New approach to solving 2D occlusion/overlapping with computational geometry at the start of the rasterization
          process. Based on our previous work for PhET's <a href="https://github.com/phetsims/kite">computational geometry library</a>,
          we are able to compute the overlap for many general regions at the same time (but using line segments with
          rational values for exact intersections and robustness).
        </li>
        <li>
          New approach to anti-aliasing with filters (evaluating arbitrary polynomial integrals over polygons, based on
          Green's Theorem). It's efficient (allows bilinear filtering with minimal performance burden), and compatible
          with other future segment types (quadratic/cubic Bezier curves, elliptical arcs, etc.).
          This also allows <a href="#antialiasing-examples">computing exact blurs</a>.
        </li>
        <li>
          New approach to <a href="#depthSort">3D graphics (using the above)</a>, where we compute (exact vector)
          occlusion and some perspective-corrected exact integrals (for barycentric-based coordinates on triangular meshes).
        </li>
        <li>
          New approach to recursive clipping of polygons, particularly with the ability to handle degenerate cases
          in a compact fashion on a GPU.
        </li>
        <li>
          Initial work for generating WGSL for high-performance computational primitives in generic form (reduce,
          prefix sum, radix sort, merge sort, stream compaction, etc.).
        </li>
        <li>
          New approach to <a href="#asyncSimulation">simulating the WebGPU execution model</a> with TypeScript async/await
          patterns (for evaluation of race conditions, uniformity, indeterminate behavior, etc.).
        </li>
      </ul>

      <p>
        Planned contributions:
      </p>

      <ul>
        <li>
          High-performance WebGPU implementation of the Alpenglow strategy, including the preceding stages (for
          transformation, conversion to line segments, etc.)
        </li>
        <li>
          Library for generic parallel computation primitives (using code generation/templating with WGSL, or potentially
          extending WGSL to a new language with templating support).
        </li>
        <li>
          Full integration of Alpenglow WebGPU into <a href="https://github.com/phetsims/scenery">Scenery</a> (as a renderer) and
          PhET simulations.
        </li>
        <li>
          Further exploration of 3D rasterization using this approach, including general (exact!) perspective-corrected
          integrals across triangles when displayed.
        </li>
        <li>
          Canvas API replacement that supports a fully vector-based description of the current Canvas state (same
          drawing commands, but can be resized/queried and has an underling vector representation). Should be able to do
          this in a high-performance way where we will have the quality of Alpenglow. Drawing over parts of the Canvas
          will "merge" vector regions into one where appropriate, so we can have a finite-memory representation of the
          state after an arbitrary number of drawing commands.
        </li>
        <li>
          Static SVG to Alpenglow converter, which will transform SVG files into Alpenglow render programs for display.
        </li>
        <li>
          Extension of the Alpenglow approach to additional curve types (hopefully in a robust way). Intersections for
          quadratic/cubic Bezier curves and elliptical arcs present some robustness problems. I would like to investigate
          rationally-based curves with rational sections (looking into
          <span class="citation" data-citation-id="Wildberger2005"></span>) to see if we can extend the robust approach.
        </li>
      </ul>

      <h3 id="overview-impact">Desired Impact</h3>

      <p>
        Ideally, Alpenglow would be able to replace the usage of Canvas/SVG (and some WebGL) for the purpose of displaying
        interactive graphics and UI, both in <a href="https://phet.colorado.edu/">PhET Interactive Simulations</a> and in other projects.
        Canvas API integration could be a drop-in replacement for existing Canvas usage, with higher-quality output.
      </p>

      <p>
        In addition, since the target is to rely on WebGPU, it would be possible to run the same code in native applications
        (using <a href="https://dawn.googlesource.com/dawn">Dawn</a> or <a href="https://wgpu.rs/">WGPU</a>),
        or possibly at the OS level. This could be potentially useful for rendering native UIs, or anything that needs
        scale-independent high-quality graphics.
      </p>

      <p>
        It would also allow <a href="https://github.com/phetsims/scenery">Scenery</a> to be a higher-performance solution
        for interactive graphics and user interfaces on the web, especially in an accessibility-compatible way.
      </p>

      <h2 id="concepts">Concepts</h2>

      <h3 id="polygonalFaces">Polygonal Faces</h3>

      <p>
        Polygonal faces are used pervasively throughout Alpenglow. In general, our data structures are designed to support
        <a href="https://en.wikipedia.org/wiki/Polygon_with_holes">polygons with holes</a>, and more generally, the
        primitive is a "list of polygons". An example is the following diagram, which can be stored in different ways:
      </p>

      <div class="example">
        <div id="polygonal-face-example"><div class="example placeholder" style="width: 256px; height: 256px;"></div></div>
      </div>

      <p>
        All of the storage representations below conform to the <code>ClippableFace</code> interface in the software
        implementation.
      </p>

      <h4>Polygonal Form</h4>

      <p>
        It can be represented in the polygonal form as three (directed) lists of vertices (with a <a href="https://en.wikipedia.org/wiki/Nonzero-rule">nonzero winding rule</a>):
      </p>

      <p>
        (0,&nbsp;0),
        (10,&nbsp;0),
        (10,&nbsp;2),
        (2,&nbsp;10),
        (0,&nbsp;10)
        <br>
        (2,&nbsp;2),
        (2,&nbsp;7),
        (7,&nbsp;2)
        <br>
        (9,&nbsp;9),
        (6,&nbsp;9),
        (9,&nbsp;6)
      </p>

      <p>
        where there is a line (edge) between each consecutive pair of vertices, and the last vertex is connected to the
        first vertex. This is compact, however the order is critical. This can cause certain operations (e.g. circular
        clipping, some tracing) to be slower than other methods. This is handled by <code>PolygonalFace</code> in the
        software implementation.
      </p>

      <h4>Edge Form</h4>

      <p>
        For much of Alpenglow's needs, things work better if we consider the list of edges itself (with start/end vertices):
      </p>

      <p>
        (0,&nbsp;0)&nbsp;=>&nbsp;(10,&nbsp;0),
        (10,&nbsp;0)&nbsp;=>&nbsp;(10,&nbsp;2),
        (10,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;10),
        (2,&nbsp;10)&nbsp;=>&nbsp;(0,&nbsp;10),
        (0,&nbsp;10)&nbsp;=>&nbsp;(0,&nbsp;0)

        (2,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;7),
        (2,&nbsp;7)&nbsp;=>&nbsp;(7,&nbsp;2),
        (7,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;2)

        (9,&nbsp;9)&nbsp;=>&nbsp;(6,&nbsp;9),
        (6,&nbsp;9)&nbsp;=>&nbsp;(9,&nbsp;6),
        (9,&nbsp;6)&nbsp;=>&nbsp;(9,&nbsp;9)
      </p>

      <p>
        Now, for most critical operations, order doesn't matter, and each edge can be considered in isolation.
        We won't need to sort things during/after operations, but also certain operations might need to generate more
        edges that get canceled out (whereas with polygonal data, we could detect it and simplify in higher-performance
        ways). This form is handled by <code>EdgedFace</code> in the software implementation.
      </p>

      <h4>Degenerate Edges</h4>

      <p>
        Additionally, we can actually construct this with an equivalent single (degenerate) polygon. Since we only care
        about the filled area, it turns out that edges that "reverse" over each other won't contribute to the filled area,
        so we can double-back. Thus the following diagram:
      </p>

      <div class="example">
        <div id="polygonal-face-canceling"><div class="example placeholder" style="width: 256px; height: 256px;"></div></div>
      </div>

      <p>
        can be constructed with an equivalent area with the following polygon:
      </p>

      <p>
        (0,&nbsp;0),
        (10,&nbsp;0),
        (10,&nbsp;2),
        (7,&nbsp;2),
        (2,&nbsp;2),
        (2,&nbsp;7),
        (6,&nbsp;9),
        (9,&nbsp;6),
        (9,&nbsp;9),
        (6,&nbsp;9),
        (2,&nbsp;7),
        (7,&nbsp;2),
        (10,&nbsp;2),
        (2,&nbsp;10),
        (0,&nbsp;10)
      </p>

      <h4>Edge-Clipped Form</h4>

      <p>
        It turns out, for computational complexity of future clipping operations (see below), it is very helpful to NOT
        directly store the edges that go along the full edge of the clipping rectangle (e.g. the 0,0 to 10,10 region in
        the diagram). Instead, those edges will be counted. Due to the property above of degenerate edges, these counts
        can be added together to add the contribution of these "clipped" edges.
      </p>

      <p>
        The diagram below shows the edge orientations that get positive (+1) counts. Reversed edges will get negative (-1)
        counts:
      </p>

      <div class="example">
        <div id="polygonal-face-edge-clips"><div class="example placeholder" style="width: 256px; height: 256px;"></div></div>
      </div>

      <p>
        Thus the edge-clipped version (for the 0,0 to 10,10 bounds) will have the following edges and counts:
      </p>

      <div class="example">
        <div id="polygonal-face-edge-clipped"><div class="example placeholder" style="width: 256px; height: 256px;"></div></div>
      </div>

      <p>
        (10,&nbsp;0)&nbsp;=>&nbsp;(10,&nbsp;2),
        (10,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;10),
        (2,&nbsp;10)&nbsp;=>&nbsp;(0,&nbsp;10),

        (2,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;7),
        (2,&nbsp;7)&nbsp;=>&nbsp;(7,&nbsp;2),
        (7,&nbsp;2)&nbsp;=>&nbsp;(2,&nbsp;2)

        (9,&nbsp;9)&nbsp;=>&nbsp;(6,&nbsp;9),
        (6,&nbsp;9)&nbsp;=>&nbsp;(9,&nbsp;6),
        (9,&nbsp;6)&nbsp;=>&nbsp;(9,&nbsp;9)
      </p>

      <p>
        This is handled by the <code>EdgedClippedFace</code> in the software implementation.
      </p>

      <p class="TODO">
        Create "Bounds Pyramid" immutable data structure. It's a binary tree, leaves are edges. Each level stores the
        bounding box. PIP-tests are VERY accelerated on this. Clipping also potentially will be (if a node is fully within
        one side of a binary clip, we can just INCLUDE that node). Thus we will have a linked list way of describing how
        the nodes connect (so we can reuse parts of the source within the result). At a certain point, this will be less
        beneficial, but high-up it might be very helpful? Test PIP and clipping with this. (each section has bounds and
        children). Linked list so we can store "sequences" immutably, that don't get changed.
      </p>

      <p class="TODO">
        Grid clips with bounds-pyramids could be even more efficient on the CPU. Does it really help with the GPU at all?
        When clipped, recompute bounds (will be tighter)? Or at a certain point, do we stop recomputing bounds?
      </p>

      <p>
        Edge-clipped form is particularly helpful for repeated subdivision of clipping areas. If you have a polygon
        clipped to a large rectangle, and then clip that to a smaller rectangle (fully contained by the larger one),
        the counts from the larger rectangle can be directly copied to the smaller case.
      </p>

      <h3 id="clipping">Clipping</h3>

      <h4>Line Clipping</h4>

      The first primitive we will need is the ability to find the part of a line segment that is within an axis-aligned
      bounding rectangle (referred to as bounds). Alpenglow primarily uses <span class="citation" data-citation-id="Matthes_2019"></span>.

      <div class="example">
        <div id="clipping-line-example"><div class="example placeholder" style="width: 256px; height: 256px;"></div></div>
      </div>

      <p class="TODO">Documentation</p>

      <p class="TODO">Edge-Clipped note about preservation of counts</p>

      <p class="TODO">Circular clipping and other types. Write up dual concept of clipping.</p>

      <p class="TODO">
        Read Skala summary of clipping, see if I'm missing something. Also
        <a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9605407/">this looks valuable as an intro for others.</a>
      </p>

      <p class="TODO">
        Stripe clipping gets messed up with fake edges. Vaguely? Investigate.
      </p>

      <p>
        Maillot clipping ideas <span class="citation" data-citation-id="10.1145/130881.130894"></span> referenced in <span class="citation" data-citation-id="agoston2005computer"></span>.
      </p>

      <h3 id="booleanOperations">Boolean Operations</h3>

      <p class="TODO">
        Describe the general overview of our polygonal overlay computations.
      </p>

      <p class="TODO">
        Check "standalone" performance against other robust implementations?
      </p>

      <p class="TODO">
        Create "standalone" library for this!
      </p>

      <p class="TODO">
        Review <a href="https://www.mdpi.com/2073-8994/10/10/477">paper</a>
      </p>

      <div class="example">
        <div id="boolean-operations-example"><div class="example placeholder" style="width: 340px; height: 300px;"></div></div>
        Example boolean operation, intersection shown in blue, differences shown in red and green.
      </div>

      <p class="TODO">
        Extend this to other segment types.
      </p>

      <p class="TODO">
        Local rationals! Don’t worry about approximate intersections. Just catch all intersections of rationalization
        within zones where curves are close. Just get rational T values, points, tangents (the slopes of the rationalized
        curves). We will be within epsilons anyway. Just act like the rational intersection is on it (for our
        rasterization purposes). Only rationalize the intersected areas. Rational endpoints, rational intersections,
        rational parameterization.
      </p>

      <p class="TODO">
        Can we find a class of curves that has rational intersections only?
      </p>

      <p class="TODO">
        Review Wildberger book, see rational parametrizations of conics/circle. (Rational trigonometry?)
      </p>

      <h3 id="antialiasing">Anti-Aliasing</h3>

      <h4 id="antialiasing-integrals">Integrals</h4>

      <p>
        This will delve into some of the mathematics and background information behind the anti-aliasing techniques used
        in Alpenglow. Most of the anti-aliasing is based on being able to clip polygons down to pixel boundaries, and then
        integrating the polygonal coverage over the pixel.
      </p>

      <h5>Green's Theorem and Polygons</h5>

      <p>
        Using <a href="https://en.wikipedia.org/wiki/Green%27s_theorem">Green's Theorem</a>, we can convert a double
        integral over a region into a line integral over the (closed, oriented counter-clockwise) boundary of the region:
      </p>

      <p>
        $$
        \oint_P\left(L\,\frac{dx}{dt}+M\,\frac{dy}{dt}\right)dt=\iint_P \left( \frac{\partial M}{\partial x}-\frac{\partial L}{\partial y} \right)\,dx\,dy
        $$
      </p>

      <p>
        for curves parameterized on $t$.
      </p>

      <p>
        For polygons, this means that if we can evaluate a line integral over each line segment (between $(x_i,y_i)$ and
        $(x_{i+1},y_{i+1})$, finishing with $(x_i,y_i)$ to $(x_0,y_0)$), we can sum up each edge's contribution to
        evaluate the double integral for the region inside the polygon. Each line segment is parameterized curve:
      </p>

      <p>

      </p>
        $$
        x=x(t)=(1-t)x_i+(t)x_{i+1}=x_i+t(x_{i+1}-x_i)
        $$

        $$
        y=y(t)=(1-t)y_i+(t)y_{i+1}=y_i+t(y_{i+1}-y_i)
        $$

        for $0 \le t \le 1$, with the derivatives:

        $$
        \frac{dx}{dt}=x_{i+1}-x_i
        $$

        $$
        \frac{dy}{dt}=y_{i+1}-y_i
        $$

        Note:
      </p>

      <ol>
        <li>
          If we reverse an edge (swap its endpoints), it will swap the sign of the contribution to the integral (a polygon
          can make a degenerate turn and double-back precisely, with no contribution to area). Thus for terms, swapping $i$
          and $i+1$ will swap the sign of the contribution. This means that polygons with holes can be evaluated by
          visiting the holes with the opposite orientation (clockwise).
        </li>
        <li>
          This is evaluated on closed polygons, so any terms that only depend on one endpoint will cancel out (e.g.
          $x_i^2y_i$ and $-x_{i+1}^2y_{i+1}$ will have their contributions cancel out, since both of those will be
          evaluated for every point in the polygon). It is useful to adjust the coefficients to these terms, since they
          can allow us to factor the expressions into simpler forms (e.g. the Shoelace formula below).
        </li>
      </ol>

      <p>
        We can pick $L$ and $M$ below:

        $$
        L=(n-1)\int f\,dy
        $$

        $$
        M=(n)\int f\,dx
        $$

        so that

        $$
        \iint_P \left( \frac{\partial M}{\partial x}-\frac{\partial L}{\partial y} \right)\,dx\,dy=
        \iint_P \left( (n)f - (n-1)f \right)\,dx\,dy=
        \iint_P f\,dx\,dy
        $$

        for any antiderivatives and real $n$, since the double integral will then be integrating our function $f$. It turns
        out, evaluating Green's Theorem over line segments for polynomial terms for any linear blend (any $n$) of $L$ and
        $M$ will differ only in the "canceled out" terms, so each edge's contribution will be the same.
      </p>

      <h5>Integrating Arbitrary Polynomials over Polygons</h5>

      <p>
        If we zero out all of the canceled terms, it turns out that we can evaluate the integral of any polynomial term $x^my^n$ over a polygon $P$ by summing up the contributions of each edge:

        $$
        \iint_Px^my^n\,dx\,dy=\frac{m!n!}{(m+n+2)!}\sum_{i}\left[ (x_iy_{i+1}-x_{i+1}y_i) \sum_{p=0}^m\sum_{q=0}^n \binom{p+q}{q}\binom{m+n-p-q}{n-q}x_i^{m-p}x_{i+1}^py_i^{n-q}y_{i+1}^q \right]
        $$

        This was first discovered by <span class="citation" data-citation-id="Soerjadi1968OnTC"></span>.
        The contributions of each term can be summed up individually to integrate arbitrary polynomials.
      </p>

      <p>
        e.g. for $x^4y^2$ in matrix form:

        $$
        \iint_Px^4y^2\,dx\,dy=
        \frac{1}{840}
        \sum_{i}
        \left(
        (x_iy_{i+1}-x_{i+1}y_i)
        \begin{bmatrix}
        x_i^4 & x_i^3x_{i+1} & x_i^2x_{i+1}^2 & x_ix_{i+1}^3 & x_{i+1}^4
        \end{bmatrix}
        \begin{bmatrix}
        15 & 5 & 1\\
        10 & 8 & 3\\
        6 & 9 & 6\\
        3 & 8 & 10\\
        1 & 5 & 15
        \end{bmatrix}
        \begin{bmatrix}
        y_i^2\\
        y_iy_{i+1}\\
        y_{i+1}^2
        \end{bmatrix}
        \right)
        $$
      </p>

      <h5>Area of Polygons</h5>

      <p>
        For $x^0y^0=1$, with adding some canceling terms to better factor, we will obtain the
        <a href="https://en.wikipedia.org/wiki/Shoelace_formula">Shoelace formula</a> for finding the area of a polygon:

        $$
        area_P=\iint_P1\,dx\,dy=
        \frac{1}{2}
        \sum_{i}
        (x_i+x_{i+1})(y_{i+1}-y_i)
        $$
      </p>

      <h5>Centroids of Polygons</h5>

      <p>
        <strong>NOTE: see numerical stability notes below, this top way can be inaccurate with floating-point numbers</strong>
      </p>

      <p>
        For $x$ and $y$, we have:

        $$
        \iint_Px\,dx\,dy=
        \frac{1}{6} \sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i+x_{i+1})
        $$

        $$
        \iint_Py\,dx\,dy=
        \frac{1}{6} \sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(y_i+y_{i+1})
        $$

        We can divide the integrals of $x$ and $y$ by the area to get the centroid of the polygon:

        $$
        centroid_x=
        \frac{1}{3}
        \frac{\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i+x_{i+1})}{\sum_{i}(x_i+x_{i+1})(y_{i+1}-y_i)}
        $$

        $$
        centroid_y=
        \frac{1}{3}
        \frac{\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(y_i+y_{i+1})}{\sum_{i}(x_i+x_{i+1})(y_{i+1}-y_i)}
        $$

        This is particularly useful, since if we have any linear function over $(x,y)$ (say, a linear gradient between
        two colors), the average color in the polygon would be the value of that function at the centroid!
      </p>

      <h5>Numerical Stability (addendum)</h5>

      <p>
        The above formulas (from <span class="citation" data-citation-id="DrangGreens"></span>)
        seem to magnify floating-point error significantly. It's highly recommended that either of the following methods
        be used for computation:
      </p>

      <p>
        For the highest precision (but for which there is not as much shared computation between the $x$ and $y$ formulas),
        we can use the following:

        $$
        \iint_Px\,dx\,dy
        =\frac{1}{6}\sum_{i}(x_i^2 + x_i x_{i+1} + x_{i+1}^2) (y_{i+1} - y_i)
        $$

        $$
        \iint_Py\,dx\,dy
        =\frac{1}{6}\sum_{i}(y_i^2 + y_i y_{i+1} + y_{i+1}^2) (x_i - x_{i+1})
        $$

        For higher performance (but with about 30% more numerical error), one with more shared computation can be used:

        $$
        \iint_Px\,dx\,dy
        =\frac{1}{6}\sum_{i}(x_i-x_{i+1})(x_i(2y_i+y_{i+1}) + x_{i+1}(y_i+2y_{i_1}))
        $$

        $$
        \iint_Py\,dx\,dy
        =\frac{1}{6}\sum_{i}(y_{i+1}-y_i)(x_i(2y_i+y_{i+1}) + x_{i+1}(y_i+2y_{i_1}))
        $$

        Both of these approaches are incredibly more stable than the first formula pair (about half a million times less
        error, with triangles centered in x,y in (0,1000), with sizes around 1/1000).

      </p>

      <p>
        For example, with p0={x: 716.1014074033982, y: 879.8148178803798}, p1={x: 716.1017435139888, y: 879.8150887077004},
        p2={x: 716.1021422611582, y: 879.8154096593055}. The top method gives {x: 238.700559480381, y: 293.2716129164566}
        (! distance of more than 700 from the real centroid), whereas the error (distance) for the other methods are
        less than 1e-3

      </p>

      <h5>Evaluation of Filtered Polygons</h5>

      <p>
        Any polynomial-based (windowed or not) filter can be evaluated over a polygon with this approach.
      </p>

      <p>
        A simple practical example is the tent filter (for <a href="https://en.wikipedia.org/wiki/Bilinear_interpolation">Bilinear filtering</a>).
        It is effectively evaluating the integral $(1-x)(1-y)=xy-x-y+1$ over $0\le x\le1,0\le y\le1$, which we can now
        evaluate as:

        $$
        \frac{1}{24}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(12-4( x_i+y_i+x_{i+1}+y_{i+1})+2(x_iy_i+x_{i+1}y_{i+1})+
        x_iy_{i+1}+x_{i+1}y_i)
        $$
      </p>

      <h5>Evaluation of Distance over Polygons</h5>

      <p>
        Above, we saw the centroid is useful to compute exact linear gradient contributions. For purely-circular radial gradients, the equivalent is also possible to compute! We will need to instead integrate $r=\sqrt{x^2+y^2}$ (we can determine the average by dividing by the area).
      </p>

      <p>
        We will need to transform to polar coordinates first:

        $$
        r=\sqrt{x^2+y^2}
        $$

        $$
        \theta=\tan^{-1}\frac{y}{x}
        $$

        We will want to evaluate with Green's Theorem in polar coordinates:

        $$
        \oint_P\left(L\,\frac{dr}{dt}+M\,\frac{d\theta}{dt}\right)dt=\iint_P \left( \frac{\partial M}{\partial r}-\frac{\partial L}{\partial \theta} \right)\,dA
        $$

        but we will want to evaluate $r^2$ due to the coordinate change.
      </p>

      <p>
        If we pick $M=\frac{1}{3}r^3$ and $L=0$, the double integral will be our desired integral (note, $M=\frac{1}{2}r^2$ gives us the same Shoelace-like area formula).
      </p>

      <p>
        Given our definitions of $x=x_i+t(x_{i+1}-x_i)$ and $y=y_i+t(y_{i+1}-y_i)$:

        $$
        \begin{aligned}
        \frac{d\theta}{dt}&=\frac{d}{dt}\tan^{-1}\frac{y}{x} \\
        &=\frac{d}{dt}\tan^{-1}\frac{y_i+t(y_{i+1}-y_i)}{x_i+t(x_{i+1}-x_i)} \\
        &=\frac{x_iy_{i+1}-x_{i+1}y_i}{t^2((x_{i_1}-x_i)^2+(y_{i+1}-y_i)^2)-2t(x_i^2-x_ix_{i+1}-y_iy_{i+1}+y_i^2)+(x_i^2+y_i^2)}
        \end{aligned}
        $$

        Thus given $M$ and $\frac{d\theta}{dt}$, we can evaluate (with Mathematica in this case):

        $$
        \begin{aligned}
        \oint_P\left(M\,\frac{d\theta}{dt}\right)dt&=\int_0^1\frac{1}{3}r^3\frac{d\theta}{dt}\,dt \\
        &=
        \frac{s}{6d_{xy}^3}\left[
          d_{xy}\left( q_0( x_i^2 - x_ix_{i+1} - y_id_y ) + q_1( k_x + y_{i+1}d_y ) \right) +
          s^2\log\frac{k_x + k_y + d_{xy}q_1}{x_id_x + q_0d_{xy} + y_id_y}
        \right]
        \end{aligned}
        $$

        with

        $$
        d_x = x_{i+1} - x_i
        $$

        $$
        d_y = y_{i+1} - y_i
        $$

        $$
        s = x_iy_{i+1} - y_ix_{i+1}
        $$

        $$
        d_{xy} = \sqrt{d_xd_x + d_yd_y}
        $$

        $$
        q_0 = \sqrt{x_ix_i + y_iy_i}
        $$

        $$
        q_1 = \sqrt{x_{i+1}x_{i+1} + y_{i+1}y_{i+1}}
        $$

        $$
        k_x = x_{i+1}x_{i+1} - x_ix_{i+1}
        $$

        $$
        k_y = y_{i+1}y_{i+1} - y_iy_{i+1}
        $$

        thus

        $$
        \iint_P\sqrt{x^2+y^2}\,dx\,dy=
        \frac{s}{6d_{xy}^3}\left[
          d_{xy}\left( q_0( x_i^2 - x_ix_{i+1} - y_id_y ) + q_1( k_x + y_{i+1}d_y ) \right) +
          s^2\log\frac{k_x + k_y + d_{xy}q_1}{x_id_x + q_0d_{xy} + y_id_y}
        \right]
        $$
      </p>

      <h5>Checking if a Polygon is not Closed</h5>

      <p>
        We can integrate $0$ over a polygon's edges in a similar way, to compute if the polygon is not closed (there are some cases where this happens and is useful).

        $$
        0=\sum_{i}(x_iy_i-x_{i+1}y_{i+1})
        $$

        This test (and any other tests of this type) will have false-negatives (for instance, if all the points are on an x or y axis, this formula won't detect non-closed polygons). However the useful probability of that happening can be reduced by using a random point as a translation:

        $$
        0=\sum_{i}\left[(x_i-p_x)(y_i-p_y)-(x_{i+1}-p_x)(y_{i+1}-p_y)\right]
        $$
      </p>

      <h5>Assorted formulas</h5>

      <p>
        Cases where we can adjust the formulas for integrals that might be useful

        $
        \iint_P1\,dx\,dy
        =\frac{1}{2}\sum_{i}(x_i+x_{i+1})(y_{i+1}-y_i)
        =\frac{1}{2}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)
        =\frac{1}{2}\sum_{i}(y_i+y_{i+1})(x_i-x_{i+1})
        $

        $
        \iint_Px\,dx\,dy
        =\frac{1}{6}\sum_{i}(x_i + x_{i+1}) (x_i y_{i+1}-y_i x_{i+1})
        =\frac{1}{6}\sum_{i}(x_i^2 + x_i x_{i+1} + x_{i+1}^2) (y_{i+1} - y_i)
        =\frac{1}{6}\sum_{i}(x_i-x_{i+1})(x_i(2y_i+y_{i+1}) + x_{i+1}(y_i+2y_{i_1}))
        $

        $
        \iint_Py\,dx\,dy
        =\frac{1}{6}\sum_{i}(y_i + y_{i+1}) (x_i y_{i+1} - y_i x_{i+1})
        =\frac{1}{6}\sum_{i}(y_i^2 + y_i y_{i+1} + y_{i+1}^2) (x_i - x_{i+1})
        =\frac{1}{6}\sum_{i}(y_{i+1}-y_i)(x_i(2y_i+y_{i+1}) + x_{i+1}(y_i+2y_{i_1}))
        $

        $
        \iint_Px^2\,dx\,dy
        =\frac{1}{12}\sum_{i}(x_i + x_{i+1}) (x_i^2 + x_{i+1}^2) (y_{i+1} - y_i)
        =\frac{1}{12}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^2 + x_i x_{i+1} + x_{i+1}^2)
        $

        $
        \iint_Pxy\,dx\,dy
        =\frac{1}{24}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i (2 y_i + y_{i+1}) + x_{i+1} (y_i + 2 y_{i+1}))
        $

        $
        \iint_Py^2\,dx\,dy
        =\frac{1}{12}\sum_{i}(y_i + y_{i+1})(y_i^2 + y_{i+1}^2)(x_i - x_{i+1})
        =\frac{1}{12}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(y_i^2 + y_i y_{i+1} + y_{i+1}^2)
        $

        Additionally, powers of $x^m$ or $y^n$ on their own show a prime-factorization-like pattern when factored:

        $\iint_Px^0\,dx\,dy=\frac{1}{2}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)$
        $\iint_Px^1\,dx\,dy=\frac{1}{6}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1})$
        $\iint_Px^2\,dx\,dy=\frac{1}{12}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^2 + x_i x_{i+1} + x_{i+1}^2)$
        $\iint_Px^3\,dx\,dy=\frac{1}{20}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1}) (x_i^2 + x_{i+1}^2)$
        $\iint_Px^4\,dx\,dy=\frac{1}{30}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^4 + x_i^3 x_{i+1} + x_i^2 x_{i+1}^2 + x_i x_{i+1}^3 + x_{i+1}^4)$
        $\iint_Px^5\,dx\,dy=\frac{1}{42}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1}) (x_i^2 - x_i x_{i+1} + x_{i+1}^2) (x_i^2 + x_i x_{i+1} + x_{i+1}^2)$
        $\iint_Px^6\,dx\,dy=\frac{1}{56}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^6 + x_i^5 x_{i+1} + x_i^4 x_{i+1}^2 + x_i^3 x_{i+1}^3 + x_i^2 x_{i+1}^4 +
          x_i x_{i+1}^5 + x_{i+1}^6)$
        $\iint_Px^7\,dx\,dy=\frac{1}{72}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1}) (x_i^2 + x_{i+1}^2) (x_i^4 + x_{i+1}^4)$
        $\iint_Px^8\,dx\,dy=\frac{1}{90}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^2 + x_i x_{i+1} + x_{i+1}^2) (x_i^6 + x_i^3 x_{i+1}^3 + x_{i+1}^6)$
        $\iint_Px^9\,dx\,dy=\frac{1}{110}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1}) (x_i^4 - x_i^3 x_{i+1} + x_i^2 x_{i+1}^2 - x_i x_{i+1}^3 +
           x_{i+1}^4) (x_i^4 + x_i^3 x_{i+1} + x_i^2 x_{i+1}^2 + x_i x_{i+1}^3 + x_{i+1}^4)$
        $\iint_Px^{10}\,dx\,dy=\frac{1}{132}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^10 + x_i^9 x_{i+1} + x_i^8 x_{i+1}^2 + x_i^7 x_{i+1}^3 + x_i^6 x_{i+1}^4 +
          x_i^5 x_{i+1}^5 + x_i^4 x_{i+1}^6 + x_i^3 x_{i+1}^7 + x_i^2 x_{i+1}^8 + x_i x_{i+1}^9 +
          x_{i+1}^10) $

        So some powers are more efficient to evaluate than others:

        $\iint_Px^{128-1}\,dx\,dy=\frac{1}{16512}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i + x_{i+1})(x_i^2 + x_{i+1}^2)(x_i^4 + x_{i+1}^4)(x_i^8 + x_{i+1}^8)(x_i^16 + x_{i+1}^16)(x_i^32 + x_{i+1}^32)(x_i^64 + x_{i+1}^64)$
        $\iint_Px^{81-1}\,dx\,dy=\frac{1}{6642}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(x_i^{2}+x_ix_{i+1}+x_{i+1}^{2})(x_i^{6}+x_i^{3}x_{i+1}^{3}+x_{i+1}^{6})(x_i^{18}+x_i^{9}x_{i+1}^{9}+x_{i+1}^{18})(x_i^{54}+x_i^{27}x_{i+1}^{27}+x_{i+1}^{54})$
      </p>

      <p class="TODO">
        Note "ability to skip horizontal edges" in math doc. Can formulate area/centroid/bilinear to all ignore horizontals.
        We can prove alternative formulas based off of Soefredi and the "opposing variable terms"
      </p>

      <h5>Alpenglow Area Formulas</h5>

      <p>
        We are best served by the Shoelace formula. It is fast and has good numerical performance:
      </p>

      <p>
        $$
        area=
        \frac{1}{2}
        \sum_{i}
        (x_i+x_{i+1})(y_{i+1}-y_i)
        $$

        For our edge-clipped polygons, we will want to add in the contribution of the counts:

        $$
        area_{clip} = ( y_{max} - y_{min} )( count_{x_{min}} x_{min} + count_{x_{max}} x_{max} )
        $$

        NOTE that it only uses two of the counts! We can't abuse this too much, because for recursive clipping we still
        need to store the opposite counts.
      </p>

      <h5>Alpenglow Centroid Formulas</h5>

      <p>
        As noted above, some of the more common formulas have terrible numerical performance. We are using the following
        one, since it results in simple formulas for the normal and edge-clipped cases, AND it shares a good amount of
        the computation between the $x$ and $y$ formulas:
      </p>

      <p>
        $$
        centroid =
        \frac{1}{6\ area}
        \sum_{i}
        \left( x_i ( 2 y_i + y_{i+1} ) + x_{i+1} ( y_i + 2 y_{i+1} ) \right)
        \begin{bmatrix}
        x_i - x_{i+1} \\ y_{i+1} - y_i
        \end{bmatrix}
        $$

        Additionally for our edge-clipped polygons, we will want to add the contribution based on the counts:

        $$
        centroid_{clip} =
        \frac{1}{area}
        \begin{bmatrix}
        x_{avg} ( x_{min} - x_{max} ) * ( count_{y_{min}} * y_{min} + count_{y_{max}} * y_{max} ) \\
        y_{avg} ( y_{max} - y_{min} ) * ( count_{x_{min}} * x_{min} + count_{x_{max}} * x_{max} )
        \end{bmatrix}
        $$

        Where $x_{avg}=\frac{1}{2}(x_{min}+x_{max})$ and $y_{avg}=\frac{1}{2}(y_{min}+y_{max})$ are the values of the
        center of the edge-clipped area.
      </p>

      <h4 id="antialiasing-filters">Filters</h4>
      <!-- For antialiasing-->

      <p>
        Note: other approaches include
        <span class="citation" data-citation-id="doi:10.1080/2151237X.2005.10129191"></span>
        <span class="citation" data-citation-id="doi:10.1080/2151237X.2005.10129189"></span>
        <span class="citation" data-citation-id="10.1111:cgf.12070"></span>
      </p>

      <p>
        For each filter, we are going to evaluate it at a pixel's "center" to get a single sample. We will do this by
        evaluating the integral of the filter's value over the area included in the polygon within the filter's range of
        support.
      </p>

      <p>
        Below, we are able to evaluate piecewise-polynomial filters with an exact analytical solution. While the box filter
        has one piecewise-polynomial section (the constant function), the others are split up into gridded sections,
        each of which is a polynomial. We will essentially determine the clipped part of the polygon for each section,
        and evaluate the integral with the formula for that section.
      </p>

      <p>
        NOTE: When using a non-box filter, we will need to examine/process a larger area than just the resulting bounds.
        In the code, this is called the <code>contributionBounds</code>.
      </p>

      <h5>Box Filter</h5>

      <div class="example">
        <div id="box-diagram"><div class="example placeholder" style="width: 600px; height: 200px;"></div></div>
      </div>

      <p>
        We will take our filter (which is a constant function of 1 in a unit-square centered on the origin), and evaluate
        it just for the square area around it. This is equivalent to finding the area of the polygon within this square area.
      </p>

      <h5>Bilinear Filter</h5>

      <div class="example">
        <div id="bilinear-diagram"><div class="example placeholder" style="width: 600px; height: 200px;"></div></div>
      </div>

      <p>
        Bilinear filtering uses the tent filter $f(t) = |1-t|$ for $t \in [-1,1]$, but extended to 2D:
        $f(x,y) = |1-x| |1-y|$. Thus it has four quadrants, each of which is polynomial. We can evaluate the integral for
        the edges of the clipped polygon within each of these quadrants with the formula:

        $$
        \frac{1}{24}\sum_{i}(x_iy_{i+1}-x_{i+1}y_i)(12-4( x_i+y_i+x_{i+1}+y_{i+1})+2(x_iy_i+x_{i+1}y_{i+1})+
        x_iy_{i+1}+x_{i+1}y_i)
        $$

        We can simply take the absolute value of the relative $x$ and $y$ values, and use the same formula for all
        quadrants.
      </p>

      <h5>Mitchell-Netravali Filter</h5>

      <div class="example">
        <div id="mitchell-netravali-diagram"><div class="example placeholder" style="width: 600px; height: 200px;"></div></div>
      </div>

      <p>
        The Mitchell-Netravali filter (really a class of filters, where we use the common parameter values $B=1/3$ and $C=1/3$)
        is a piecewise-polynomial filter. Notably:

        $$
        f(t)=
        \begin{cases}
          |t|<1, & \frac{1}{6}((12 - 9B - 6C)t^3 + (-18 + 12B + 6C)t^2 + (6 - 2B)) \\
          1\le |t| < 2, & \frac{1}{6}((-B - 6C)t^3 + (6B + 30C)t^2 + (-12B - 48C) t + (8B + 24C))
        \end{cases}

        $$

        We will use the above polynomial filtering techniques to evaluate the integral. The resulting formulas are not
        terribly pretty. We will have 3 cases to handle after symmetry (we can use the absolute value to get down to 4
        cases, and then reflection to get down to 3).
      </p>

      <p>
        Note that this filter also has sections where it goes negative, so it can actually cause negative values.
        We try not to clamp color values until the very end (during gamut mapping), so that this effect will work nicely.
      </p>

      <p>
        It will result in 16 different sections, each of which will contribute to a final output sample (pixel).
      </p>

      <h4 id="antialiasing-examples">Examples</h4>

      <p>
        Aliasing (what we are trying to avoid) is the low-frequency pattern that spuriously appears when we display
        high-frequency content in a low-resolution way. In general, we want to reduce these low-frequency patterns
        without overly introducing blurring.
      </p>

      <h5>Siemens Stars</h5>

      <p>
        Siemens stars can show Moiré patterns.
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span2"></div>
        <div class="span4 example">
          <div id="siemens-canvas"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Canvas
        </div>
        <div class="span4 example">
          <div id="siemens-svg"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          SVG
        </div>
        <div class="span2"></div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="siemens-box"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Box Filter
        </div>
        <div class="span4 example">
          <div id="siemens-bilinear"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Bilinear Filter
        </div>
        <div class="span4 example">
          <div id="siemens-mitchell-netravali"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
          Mitchell-Netravali Filter
        </div>
      </div>

      <h5>Blurs</h5>

      <p>
        If we artificially scale the filter (possible in Alpenglow software, but not planned for GPU), it is possible to
        generate blurs.
      </p>


      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span6 example">
          <div id="blur-reference"><div class="example placeholder" style="width: 150px; height: 150px;"></div></div>
          Reference (Dirac Delta)
        </div>
        <div class="span6 example">
          <div id="blur-box"><div class="example placeholder" style="width: 150px; height: 150px;"></div></div>
          Box Filter
        </div>
      </div>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span6 example">
          <div id="blur-bilinear"><div class="example placeholder" style="width: 150px; height: 150px;"></div></div>
          Bilinear Filter
        </div>
        <div class="span6 example">
          <div id="blur-mitchell-netravali"><div class="example placeholder" style="width: 150px; height: 150px;"></div></div>
          Mitchell-Netravali Filter
        </div>
      </div>

      <p class="DEFERRED">Polynomial approximation to lanczos filter would be great</p>

      <p class="DEFERRED">
        Unsharp masking would be great with the "overblurring" (scaling the filter up). Is there a way to create a
        <code>RenderBlur</code> in any reasonable fashion? This would need... multiple levels of Alpenglow reduction?
        (Would sharpen the oklab luminosity channel)
      </p>

      <p class="DEFERRED">
        We could compute the gradient of a moving window/filter. Compute deltas of points in the directions they will move,
        then compute the derivative based on that. Points will move along edges if on the boundary.
        Points in corners don't have compensation. Points on the "shifting" (not sliding) border likely have compensation.
      </p>

      <h3 id="color-and-blending">Color &amp; Blending</h3>

      <p>
        Blending of colors is important to get right for correct visual results. There are multiple color spaces where
        blending can be done. Furthermore, anti-aliasing with correct blending will give a more accurate appearance.
      </p>

      <p>
        For example, in the above anti-aliasing examples, on many browsers the SVG/Canvas examples will show a "darkening"
        in areas where there is a lot of overlap. This is typically a result of blending in the sRGB color space, which
        is NOT linear.
      </p>

      <p class="TODO">Blending with/without linear conversion</p>

      <p class="TODO">Blending in different color spaces</p>

      <p class="TODO">Premultiplication section</p>

      <p class="TODO">Gamma correction notes</p>

      <p class="TODO">sRGB and Display P3 --- Note the Display P3 color matrices (we could not find those online)</p>

      <p class="TODO">
        Check that the Display P3 colors are side-by-side identical - maybe white point slight change? WE SHOULD check
        Chrome's conversions
      </p>

      <p class="TODO">Gamut Mapping</p>

      <p class="TODO">
        Better gamut mapping: <a href="https://bottosson.github.io/posts/gamutclipping/">https://bottosson.github.io/posts/gamutclipping/</a>
      </p>

      <p class="TODO">More documentation</p>

      <p class="TODO">
        Get oklch color space up and running, can do hue gradients then.
      </p>

      <h3 id="renderProgram">RenderProgram</h3>

      <p class="TODO">Documentation</p>

      <p class="TODO">
        NOTE: That the CAG process is essentially just simplifying into a different RenderStack (RenderUnion, because it
        is disjoint?) with each RenderableFace included. And our rasterizer is also just a RenderProgram transform, giving
        us a RenderableFace broken down to the pixel level. Note that we split everything except for gradients/depth at the
        first step, THEN we split gradients/depth. We could potentially generalize to different stages of splitting, OR
        we could force splitting all at once.
      </p>

      <p class="TODO">
        Allow more general computation, with conditionals, RenderPrograms that provide the x,y, Constant(n), all Vector4
        for now (based on our stack for things). Possibly, can we auto-differentiate RenderPrograms?
      </p>

      <p class="TODO">
        Generalized first-class rasterization: Implement gaussian blur / drop shadow, by executing an overfiltered copy,
        then combine? This would require a lot more legwork to do.
      </p>

      <h4 id="RenderColor">RenderColor</h4>

      <p>
        RenderColor displays a single solid color everywhere, and is a basic building-block for many other RenderPrograms.
      </p>

      <div class="example">
        <div id="RenderColor-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <p>
        Note that the default color space is <a href="https://en.wikipedia.org/wiki/SRGB">sRGB</a>. Additionally, the
        values are by default interpreted with
        <a href="https://en.wikipedia.org/wiki/Alpha_compositing#Straight_versus_premultiplied">premultiplied alpha</a>
        (also called associated alpha), so <code>0.5, 0, 0, 0.5</code> will represent a 50% transparent fully-red color.
      </p>

      <h4 id="RenderPathBoolean">RenderPathBoolean</h4>

      <p>
        RenderPathBoolean will display one RenderProgram "inside" the path, and another RenderProgram "outside" the path.
      </p>

      <div class="example">
        <div id="RenderPathBoolean-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <h4 id="RenderStack">RenderStack</h4>

      <p>
        RenderStack will apply normal compositing/blending to a list of RenderPrograms, where each RenderProgram in the
        list is drawn "on top" of all of the previous ones.
      </p>

      <div class="example">
        <div id="RenderStack-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <h4 id="RenderLinearBlend">RenderLinearBlend</h4>

      <p>
        RenderLinearBlend will interpolate between two different RenderPrograms based on the location. It will evaluate
        <code>clamp( dot( scaledNormal, point ) - offset, 0, 1 )</code>, and will linearly blend between the "zero"
        program (when the value is 0) and the "one" program (when the value is 1).
      </p>

      <p>
        It can be used in a standalone way, however it is primarily meant to be used when a <code>RenderLinearGradient</code>
        is split into each section between two gradient stops.
      </p>

      <div class="example">
        <div id="RenderLinearBlend-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <h4 id="RenderLinearGradient">RenderLinearGradient</h4>

      <p>
        RenderLinearGradient will display the typical linear gradient.
      </p>

      <div class="example">
        <div id="RenderLinearGradient-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <p class="TODO">
        NOTE: The shear cases are broken on splits, we can't just transform the points
      </p>

      <h4 id="RenderRadialBlend">RenderRadialBlend</h4>

      <p>
        RenderRadialBlend will interpolate between two different RenderPrograms based on the location. It will evaluate
        <code>clamp( ( averageFragmentRadius - radius0 ) / ( radius1 - radius0 ), 0, 1 )</code>, and will linearly blend
        between the "zero" program (when the value is 0) and the "one" program (when the value is 1).
      </p>

      <p>
        It can be used in a standalone way, however it is primarily meant to be used when a <code>RenderRadialGradient</code>
        is circular, and is split into each radial-linear partition.
      </p>

      <div class="example">
        <div id="RenderRadialBlend-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <h4 id="RenderRadialGradient">RenderRadialGradient</h4>

      <p>
        RenderRadialGradient will display the typical radial gradient.
      </p>

      <div class="example">
        <div id="RenderRadialGradient-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <p class="TODO">The split "Reflect" extend type seems to be reversed from the desired behavior!</p>

      <p class="TODO">We need UnsplitCentroid to get more general conic gradients working, it should NOT split in those cases</p>

      <h4 id="RenderBlendCompose">RenderBlendCompose</h4>

      <p>
        RenderBlendCompose will blend/composite two RenderPrograms with a more general blending model, that takes in
        a Porter-Duff <a href="https://en.wikipedia.org/wiki/Alpha_compositing">compositing mode</a>, in addition to a
        <a href="https://en.wikipedia.org/wiki/Blend_modes">blend mode</a>.
      </p>

      <div class="example">
        <div id="RenderBlendCompose-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <p class="TODO">Add blend type example, once text is easier, like https://learn.microsoft.com/en-us/windows/win32/direct2d/blend</p>

      <p class="TODO">Docs/demo for RenderColorSpaceConversion (and color conversions), RenderPremultiply/RenderUnpremultiply</p>
      <p class="TODO">Docs/demo for RenderImage</p>

      <p class="TODO">
        Images can use the splitting infrastructure. Split with pixels. For regions of filter support. Box filter single
        sample per square. Bilinear larger and overlapping and each has 4 samples. Mitchell-Netravali has 16 per. Need to
        potentially store the transform, for asymmetric shear etc. we split into filterable sections.
      </p>

      <p class="TODO">
        Mipmapping! Blend between adjacent mipmap levels (especially for perspective). Anisotropic for 3d, like
        <a href="https://en.wikipedia.org/wiki/Anisotropic_filtering#/media/File:MipMap_Example_STS101_Anisotropic.png">this example</a>.
        We will want to mipmap "based on the area" if we aren't doing the "analytic" bits. So isotropic mipmap can look at the scale.
        We will want to "bias" presumably, adjustably?
        We will want to mipmap-blend the "extended" version (in each mipmap level computed, if non-power-of-2, it should be "padded" with the extended version for correct results)
        Mipmaps should work for any filter type
      </p>

      <p class="TODO">
        HOW are we handling Images? One atlas? How are we able to split things so this works?
        We could always just... determine the size, and create that many atlases?
      </p>

      <p class="TODO">Docs/demo for RenderAlpha</p>
      <p class="TODO">Docs/demo for RenderFilter</p>
      <p class="TODO">Docs/demo for RenderBarycentricBlend</p>
      <p class="TODO">Docs/demo for RenderBarycentricPerspectiveBlend</p>
      <p class="TODO">Docs/demo for RenderDepthSort / RenderPlanar</p>
      <p class="TODO">Docs/demo for RenderPhong / RenderLight / RenderNormalize</p>

      <h4 id="renderProgram-simplification">Simplification</h4>

      <p class="TODO">Documentation</p>

      <h4 id="renderProgram-execution">Execution</h4>

      <p>
        The software tree form of RenderPrograms can be directly executed. However it is better if we turn our RenderProgram
        into a series of instructions that can be executed in a more efficient way.
      </p>

      <p>
        We are able to output instructions (both in software as types, and on the GPU in a binary format) that operate in
        a simple stack-based fashion. We will have a main stack, where vec4 values are pushed and popped. Instructions
        primarily either push content, or pop content then push more content (processing existing values). We are able to
        determine a bound on the stack size by analyzing the RenderProgram (normal visual blending won't cause stack
        issues, since we are able to blend each pair of colors before computing the next color).
      </p>

      <p>
        This is combined with a stack of instruction indices, to support function calls and return values, in addition to
        simple branching logic. It turns out all of the primitives we need can be done with "opaque jumps" (if the top of
        the stack is a fully-opaque value, jump to an instruction index) and function calls.
      </p>

      <h5>Inputs for Evaluation</h5>

      <p>
        <code>RenderEvaluationContext</code> represents the main inputs to evaluation, namely:
      </p>

      <ul>
        <li>Bounding box of affected area (typically clipped to pixel sizes)</li>
        <li>(optional) face data (<code>ClippableFace</code>, or other edge-clipped data on the GPU)</li>
        <li>(optional) area</li>
        <li>(optional) centroid</li>
      </ul>

      <h5>Direct Evaluation</h5>

      <p>
        <code>RenderProgram</code> (TS object) supports the direct evaluation by calling
        <code>.evaluate( context: RenderEvaluationContext )</code>.
      </p>

      <h5>Instruction Evaluation</h5>

      <p>
        <code>RenderProgram</code> has a method <code>.writeInstructions( instructions: RenderInstruction[] )</code> that
        will write out instruction objects (<code>RenderInstruction</code>) in a stream-like fashion.
      </p>

      <p>
        These can be evaluated with <code>RenderExecutor</code> in software (load the instructions, and then it can be
        executed many times with different contexts).
      </p>

      <h5>Binary Instruction Evaluation</h5>

      <p>
        <code>RenderInstruction</code> has a number of utilities, including the ability to convert a stream of
        instructions into a binary format. These can then be executed on the GPU with WGSL code.
      </p>

      <p>
        Instruction lists in the binary format are always suffixed with an exit instruction, so that we can have a
        solid block containing multiple instruction lists. During rasterization, we need to track the index of the start
        of a RenderProgram's instructions inside this list.
      </p>

      <h5>Instructions Demo</h5>

      <p>
        Edit the below code to see its form, and the object/binary instructions associated with it.
      </p>

      <div class="example">
        <div id="execution-instructions-example"><div class="example placeholder" style="width: 512px; height: 128px;"></div></div>
      </div>

      <p class="TODO">
        Add "stack depth" getters to RenderProgram (both the main and instruction stacks), so we know what depth is needed.
        RenderPhong can use up a good amount of stack space (ambient/diffuse/specular/position/normal + direction/color for
        each light), BUT we really want to minimize the memory individual invocations/threads take up during execution.
        We can customize the stack sizes at shader creation time... maybe we can do pipeline-overloadable constants
        instead?
      </p>

      <p class="TODO">
        Consider separating out the RenderInstruction items into its own subdirectory. Potentially the logic items also.
      </p>

      <p class="TODO">
        Add error handling to detect if we blow past a stack?
      </p>

      <p class="TODO">
        Full unit tests to compare all of the versions. Flesh it out, fuzz it.
      </p>

      <p class="TODO">
        Detect if we run past a buffer, and set a flag so we can restart with a larger buffer? This will still cause a delay...
      </p>

      <p class="TODO">
        Document stack-based operations with stack-effect diagrams: "( before -- after )" (top of stack states).
      </p>

      <p class="TODO">
        If we have hashed RenderPrograms, deduplicate so we only write RenderProgram instructions for one.
      </p>

      <p>
        Note: RenderProgram instruction execution can be interrupted and saved, if we want to do so in the future.
      </p>

      <h3 id="WGSL">WGSL</h3>

      <p>
        Specification changes: <a href="https://github.com/gpuweb/gpuweb/commits/main">https://github.com/gpuweb/gpuweb/commits/main</a>,
        WebGPU Chrome changes: <a href="https://developer.chrome.com/blog/new-in-webgpu-118/">https://developer.chrome.com/blog/new-in-webgpu-118/</a>,
        subgroups proposal: <a href="https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md">https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md</a>.
      </p>

      <p>
        Safari progress: <a href="https://github.com/WebKit/WebKit/commits/main/Source/WebGPU">https://github.com/WebKit/WebKit/commits/main/Source/WebGPU</a>.
      </p>

      <p class="TODO">
        Try out "unrestricted pointer parameters", can we do things better with this?
      </p>

      <p class="TODO">
        Try out subgroups <a href="https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md">https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md</a>,
        need to opt-in on chrome. Could write specific options to use subgroups where available.
      </p>

      <p class="TODO">
        Complete TypeScript switch (e.g. testing, raster-clipper, etc.) -- almost done, get things working with phet-lib build.
      </p>

      <p class="TODO">
        Inspect <a href="https://github.com/brendan-duncan/webgpu_inspector">WebGPU Inspector</a> by Brendan Duncan.
      </p>

      <p class="TODO">
        Review <a href="https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md">Subgroups proposal</a>.
      </p>

      <p class="TODO">
        We are <em>very broken</em> with non-integral devicePixelRatio (browser zoom in).
        Use <a href="https://developer.mozilla.org/en-US/docs/Web/API/MediaQueryList/change_event">media query change event</a>
        to potentially detect (a) display-p3 availability, (b) devicePixelRatio, (c) light-dark mode changes,
        (d) possibly WebGPU support changes?
        <a href="https://github.com/phetsims/scenery/issues/1585">https://github.com/phetsims/scenery/issues/1585</a>.
        <a href="https://stackoverflow.com/questions/28905420/window-devicepixelratio-change-listener">Alternative ref 1</a>,
        <a href="https://developer.mozilla.org/en-US/docs/Web/API/Window/devicePixelRatio">MDN ref</a>.
      </p>

      <p class="TODO">
        Lint with  <a href="https://github.com/wgsl-analyzer/wgsl-analyzer">wgsl-analyzer</a> on generated output, in an
        automated way.
      </p>

      <p class="TODO">
        Investigate <a href="https://lightsighter.github.io/CudaDMA/">CudaDMA</a> for memory patterns.
      </p>

      <p>
        WGSL Parsing options: <a href="https://lezer.codemirror.net/">https://lezer.codemirror.net/</a>
      </p>

      <p class="TODO">
        Add a JS WGSL parser, so we can do AST manipulations and checks. Lots of assertions possible. Look into peggy.js.
        Or <a href="https://github.com/mehmetoguzderin/tree-sitter-wgsl/blob/main/grammar.js">Tree Sitter WGSL grammar</a>
        (requires WASM?)
      </p>

      <p class="TODO">
        Read <a href="https://gpuopen.com/learn/optimizing-gpu-occupancy-resource-usage-large-thread-groups/">this</a> for performance notes.
      </p>

      <p class="TODO">
        Bake in support for context loss handling.
      </p>

      <p class="TODO">
        Look more into <a href="https://shadeup.dev/">Shadeup</a>.
      </p>

      <p class="TODO">
        Create a pattern so that we can easily pre-load shaders before they are used.
      </p>

      <p class="TODO">
        Live WGSL sandbox with compute input/output (and all our existing WGSL included?) -- use our templates in this,
        would be powerful. Have it work like ShaderToy a bit.
      </p>

      <p class="TODO">
        Label canvas swapchain textures! texture.label =
      </p>

      <p class="TODO">
        Better familiarize with <a href="https://shader-slang.com/slang/user-guide/00-introduction.html">https://shader-slang.com/slang/user-guide/00-introduction.html</a>.
      </p>

      <p>
        In WGSL, prefer CamelCase for types, but snake_case for variables/functions. This helps differentiate (a bit)
        the template variables (camcelCase).
      </p>

      <p class="TODO">
        Use <code>clearBuffer</code> to zero out buffers (potentially when new), will that flag Dawn to not add barriers
        and usage switches? (For the parallel execution of dispatches).
      </p>

      <h4 id="WGSL-templating">Templating</h4>

      <p class="TODO">
        Factored out way of handling the "expression or statement" facility, so that things can be specified in a
        generic way (WGSLContext CAN turn these into function calls, but we would like to do this in places where we
        do not want function call overhead.
      </p>

      <p class="TODO">
        Better way of guaranteeing unique local variable names. For now, try to add prefixes? This will be hard to
        notice if we're silently shadowing variables.
      </p>

      <p class="TODO">
        A templating function multiply (for constants), that can replace with bit shifts where helpful?
      </p>

      <p class="TODO">
        Sometimes parsing the WGSL of a parameter (to tell if it's doing something, and possibly optimizing it) could be
        useful. Getting to work with the WGSL AST would be great. For instance, knowing "hey, is this a statement or
        an expression?" would be great.
      </p>

      <h4 id="WGSL-types">Types</h4>

      <p class="TODO">
        "atomic" types possible for different configurations (compound objects!), we'll want to get these set up for
        atomic-able operations.
      </p>

      <p class="TODO">
        ConcreteType Structures<br>
        We'll want to be able to slice/subtype (e.g. "extract these fields)? - e.g. sort on subset, move full object<br>
        Support bit packing (combine booleans, finite-precision types - NOTE HLSL2021 includes bitfields)<br>
        Structure-of-arrays handling<br>
        Use size/alignment of structures for array/structure types. Our vec3 array is broken right now.
        Note that storage and uniform address spaces have different layout constraints:
        <a href="https://www.w3.org/TR/WGSL/#address-space-layout-constraints">https://www.w3.org/TR/WGSL/#address-space-layout-constraints</a>.
      </p>

      <p class="TODO">
        Support runtime-sized arrays in ConcreteType.
      </p>

      <p class="TODO">
        Have "casting" of ConcreteTypes to other types (e.g. reinterpretation as u32 array, or something using
        atomics in one pass, and not in another). This allows us to strongly type BufferSlots in varying contexts.
      </p>

      <h4 id="WGSL-linking">Linking</h4>

      <p class="TODO">
        Fully review WGSL linking from Mighdoll:
        <a href="https://github.com/mighdoll/wgsl-linker/blob/parser/README.md">wgsl-linker</a>
        and the <a href="https://github.com/mighdoll/wgsl-linker/blob/parser/packages/mini-parse/Parsing-README.md">parsing notes</a>.
        Note Nearley, Lezer might be good options for parsing fast.
      </p>

      <p class="TODO">
        Load/Store (separate from, but enables fusing)<br><br>

        Type terms:
          - WGSL-type - a WGSL type, including atomics and runtime-length arrays nested structs etc., however excluding
                        things we want to avoid (e.g. pointers, things that can't go in a var(storage), etc.)
          - Leafable - constructible WGSL-typeable (can let/return/parameter it), or atomic typeable (do a direct action).
                       NO combined atomics/other, and NO runtime-length arrays.
          - Leaf - the WGSL type (and location) of the individual read/write (that likely takes part of a larger
                   read/write operation). It is "leafable". - consider name "unit"
            NOTE it can have different scopes, e.g.:
              - read someArray[ 5 ]
              - read someArray[ 5 ].someField
              - read someArray[ 5 ].someField.otherSubField
              We'll need to see which can be faster (if we can read more data with a single read, even if we don't use
              some of it, vs. more reads but less data).
          - Logical type - logical array/structs with leaf nodes of boolean/integer/float/atomic types (defined number of bits).
                           describes the logical bits of a type
          - TypeScript type - Type that exists in TS, and stores the logical info in whatever way desired (easiest for
                              the client, can be read/written to/from things using a logical type).
          - Serialization - typescript <=> logical (happens for routines and start/end, and likely for BufferLogger support)
          - Packing - logical <=> buffers or BufferLogger output (can be spread apart)

        Load/Store terms:
          - Access - a read/write/atomic operation that has has a leafable WGSL-type, and could take or return a specific
                     value of that type (potentially with arbitrary data transformation/mapping)
          - Access DAG - TODO: if bidirectional, our data has DAG nature?
          - Node - TODO: has inputs and outputs? --- that's bidirectinoal
          TODO: think load-only case
            e.g. auto-generated indices(!)
            .. can also run arbitrary WGSL to generate data
            ** "read-only" nodes (like indices) can't be written to. you can't write to these "leaves"?
              -- and can't write full objects that contain these(!)
          TODO: think store-only case
            ... we can write auto-generated indices too, right?
            .. can also run arbitrary WGSL to store data?

        ********* WAIT, if we take this approach, do we LOSE the "conditional loading" setup? (only load part of what is needed?)
        **TODO TODO TODO TODO TODO** think about an example for this.
        Loads and Stores --- can accept arbitrary (no or many) arguments?
          Our loads seem to work at different levels(!)
            Say we have a logical array(struct{point:struct{x: f32, y: f32}, color:vec4u, id: u32, properties: array(u32,10)})
              - load() - IF we weren't runtime-length, could load the entire array
              - load( index )
              - load( index, 'point' )
              - BUT this doesn't work well for "mixes", e.g. "give me {x, id, properties[4]}, but named as {coord, id, fifthprop} in logical"
              load( subset ) <--
              SUBSETS:
                trivial subset: "give me the entire thing"
                "point" subset: "when asked for (index), I give load(index).point"
                "x" subset: "when asked for (index), I give load(index).point.x"
                "properties[4]" subset: "when asked for (index), I give load(index).properties[4]"
                ....?
              SUBSETS + ZIP/COMBINE:
                zip of that style: "when asked for (index), I give {coord: xSubset(index), id: idSubset( index ), fifthprop: fifthPropSubset( index )}"
                -- NOTE: a store of this might need to read the y (e.g. if we store point as vec2f in our raw).
            Clearly we have subset types:
              StructSubsetter( properties: string[], newPropNames: string[] ) -- try to keep props in order? (or... doesn't matter?)
              StaticArraySubsetter( index: number )
              DynamicArraySubsetter( index: WGSLExpressionU32 ) <--- *** we would use this one a lot
              ArraySliceSubsetter( start: number, end: number ) ???
              BitSubsetter( ...? extract bits )
              Array.applySubsetForEachElement( subsetter )
            And zip types:
              Struct( map: Record(string, ... ) )
              FixedLengthArray( elements: T[] ) <-- length implicit

              TODO TODO .... wait what? RuntimeLengthArray( ... )
                TODO: also how to describe the packing
            Permuters:
              RakedStripedPermute( workgroupSize: number, grainSize: number ) <--- uh, are we leaving an argument for the client?
            Bit Packing:
              ... maps logical types to logical types. presumably each "leaf" type has bit definitions, and we map leaf to leaf?
              TODO: how do arrays of runtime (or large) size work? Presumably we only want to "align" those in some cases, not others

              NOTE: To do bit-packing, we'll presumably need to **shift** things IN a u32.
                bitcast supports concrete numeric (scalar | vector), e.g. i32/u32/f32/f16, for vec2/vec3/vec4 of those.
                  so if it's faster, we could potentially bitcast vec4us, unless we have to split it up
                e.g. ( bitcast(u32)( a ) & 0xffff ) << 16 | ( bitcast(u32)( b ) & 0xffff0000 ), but some cases we can avoid

              BUT: we could potentially load more data at once, and bitcast PARTS of it
                NO! (I mean, potentially we can avoid this, but sometimes it might be helpful if we are array-of-structures)
                We really don't want the register pressure if possible.
              Can experiment with loading approaches.

              ... for now, can we just think about load/store to array(u32) as the main one type of conversion?
              combine immediately into vecN or matNxN (based on... zipping) for performance when needed.

              NOTE: We'll need to sometimes keep the data in a packed format, while just having the getters out. Maybe this is a default?
              Instead of extracting booleans to booleans, perhaps we extract them out of the packed format? (what can we potentially re-align?)

              bit-packing runtime arrays has different tradeoffs than bit-packing small fixed-length arrays or structs (somewhat)
                e.g. tight-packing 3-bit values into a u32, we will have 3 configurations of loads depending on the index
                  (wait, but really either "split across 1 or 2 u32s"...?) but with variable-length integers it gets tricky
                WAIT so we actually only need to handle... 2 different loading cases, since otherwise it is just
                  - "offset into N loads" or
                  - "offset into N+1 loads"
                NONO, if our source is something like a vec4u, then we can't just offset easily... we have to have different styles.
                BUT: if we just handle these with array(u32) for shared cases, we don't run into that.
                  -- we can zip into other things, or extract from u32s (like packed booleans transferred)

              WGSL-type mapping of logical types:
                WGSLStructOf( ... (ordered key map) )
                WGSLAtomicU32( atomic ) / WGSLAtomicI32( atomic )
                WGSLFixedArray( ..., length: number )
                WGSLRuntimeArray( ... )
                WGSLVec4u( ... ) / WGSLVec3f( ... ) / ...
                WGSLBitPacked( ..., { alignOptions } ) => arbitrary bit-packed structure?
              ... can we have reusable packings that we can use for things? (Yes, factor them out?)

              For a load/store, we effectively we have WGSL-type-map of our "input" and "output"

              ***** We'll actually be able to "aggregate" stores, and create WGSL-type mapping and logical types that include LOCAL VARIABLES.
              *****
              *** And likewise, we could have a load output things to multiple local variables...
                this would be like a top-level zip?

                This sounds potentially helpful NOW. And being able to "inline" a lot of the loads sounds helpful


              With booleans, presumably we can pack 32 into a u32
            Thus we would have:
              ((let x =)) load()
              store( value: WGSLExpressionT )
              ((let old =)) atomicX( ... )
              ((let old =)) atomicExchange( ... )
              ((let result =)) atomicCompareExchangeWeak( ..., ... )
                NOTE: "weak" since it might fail and no-op?

            So... once you have your logical mappings and whatnot in place, any (function/operation/mapping/load/store)
            that is provided a subset will need to specify information about how to WGSL-type the subset (bit-pack or explode),
            so we can generate the necessary WGSL code to handle it.

          *** We need to get something minimal working, so we can try it out, and get it to work with other algorithms we haven't done yet.
          *** potentially subgroup operations(!).

        \\\\\\\\\\\\\\\\\
          *** really, for our current goals, get it working so we can abstract out data sources and load/store operations,
          *** so our algorithms can be written in a generic way (can apply to anything, with mapping, collecting data sources, etc.)
        \\\\\\\\\\\\\\\\\

          ... a new language will make it so that we don't have to compute the "subsets" manually.
          ... and the load/store complications will be... almost easier?
          ... but doing code generation first will probably help figure out what the language would need.

          -----
            language, if we go that way:

          -----

          NOTE: Almost all of these access patterns can be changed with subset/zip/permute
            No args:
              - load-constant - no-args load
                load()
              - single-atomic-increment - no-args atomic
                atomicIncrement
              - store-constant - no-args set
                store()
            Some args:
              - typical array load - load( index: WGSLExpressionU32 ) => WGSLExpressionT
              - typical array store - store( index: WGSLExpressionU32, value: WGSLExpressionT ) => void
            More args:
              - typical static striped load - load( workgroupSize: number, grainSize: number, index: number ) => WGSLExpressionT
              - typical dynamic striped load - load( workgroupSize: number, grainSize: number, index: WGSLExpressionU32 ) => WGSLExpressionT
              - typical static striped store - store( workgroupSize: number, grainSize: number, index: number, value: WGSLExpressionT ) => WGSLStatement
              - typical dynamic striped store - store( workgroupSize: number, grainSize: number, index: WGSLExpressionU32, value: WGSLExpressionT ) => WGSLStatement
            SO:
              - loads/stores are essentially ( ...args ) => WGSLString (WGSLExpressionT for load, WGSLStatement for store)
              - we can also have arbitrary... other functions?
              - data source = { load: ( ...args ) => WGSLString, store: ( ...args ) => WGSLString }
              AND the strictly typed parts of this is great, DataSource&lt;LoadParams, StoreParams&gt;
              So we'll be able to handle load/stores in any way we want, AND create other type of objects as we want
            BUT WAIT:
              - given load(index) we can make a staticStripedLoad( workgroupSize, grainSize, index: number ) THAT CALLS load(index)
              - given store(index, value) we can make a staticStripedStore( workgroupSize, grainSize, index: number, value ) THAT CALLS store(index, value)
              etc.

        Examples:
          Sort:
            Give a "load" and a "store", that take the same logical type. (HEY, that is a data source on its own!)
            Give either "compare" or "extract bits + bit count" for the logical type.
            *if* the (minimized) load is zipped with the index, the store extracts the index to move the full one.
            We'll want to create buffers and workgroup vars to store the logical type
              We'll likely want to use a more packed format.
            Load (as written):
             takes (logical index), zips both the underlying data source given (logical index), and turns index into an
             index, providing (originalLoad(logical index), logical index)
            Store (as written):
              takes (logical index, logical type), extracts/subsets so we're storing (logical index, index-of-original),
              issues lookup so we are storing (logical index, load( index-of-original))
              *** our "store" op needs a "load" op
            Store (for if we didn't do the zip, and have the full element):
              takes (logical index, logical type), and we just write it into the location (normal store)
          Pairs (array of structures);
            buffer A, raw tightly-packed array(struct {x: u32, y: u32})
            data source for A: logical array(struct {x: u32, y: u32})
            data source B extracted: logical array(x: u32)
            data source C extracted: logical array(y: u32)
          Pairs (structure of arrays);
            buffer A, raw tightly-packed array(x: u32)
            buffer B, raw tightly-packed array(y: u32)
            data source for A: logical array(x: u32)
            data source for B: logical array(y: u32)
            data source C zipped: logical array(struct {x: u32, y: u32}) - load/store-able


        Mapping:
          - TODO: Should implement the TypeScript map function also?
          - TODO: Bidirectional vs unidirectional maps

        -- how to mappings fit in? it goes from one logical type to another, right?
        -- DO our mappings/transforms go from one "WGSL-type" to another, or can we make jumps in-between?
        ** can mappings only apply to certain parts(!) of a logical type? (e.g. map a vector, but leave the type intact)
        *-*- mappings should indicate:
          - what logical parts are read in
          - what logical parts are written out (changed)
          - which logical parts are unaffected (say we go from vec3 => vec2, maybe x is unaffected)

        -- Can I write things with higher-precision number types somewhat implicitly??? (e.g. knows how to multiply
           two u64s into a u128) - suggests an eventual language where this is possible and gets compiled down.


        -- Should we be able to specify "leaf" WGSL struct types, and "convert" to that specific type for an access?

        Load/store should support efficient ways of accessing data based on the particular order/striping, taking
        local variables and local/workgroup/etc. identifiers into account. Good WGSL. Consider a
        "predeclaration" somewhat similar to unroll, where we can create local variables needed for multiple loading.

        They can access BufferSlots, and will know the actual storage order (blocked/striped with workgroup/grain) and
        type. However they can also access workgroup variables, local variables, can be generated (i.e. indices or a
        mapping of that), and should handle permutation.

        DEFER:
          pointers
            - just deal with reference types, pointers will be tricky to use safely
            - do they have performance improvements?
          preambles
            - Should we try to avoid block scope preambles? Always easier to NOT have that, so we can declare things
              used later right?
            - loads/stores might have optional "preambles" (local variables we can compute with).
              ACTUALLY, we can wrap this code inside something... like unroll, but declare local variables and then have a
              "do work" section. We would NEED to guarantee we wouldn't have name conflicts (would each load have a unique
              ID?) --- you could TRANSFORM a "non-premabled" load into a "preambled load"?
            - Instead of preambles for now, make sure we can subset nicely to load "what we need"
          automatic subsetting by what is used
            - if we automatically check what "parts" of an implicit struct are used BEFORE deciding how to subset the type,
              we'll need to "run the code generation" for that part first. With the blueprint bit, that is tricky?
              Could we "dry compile" it, see what calls are needed, and then do it for real?
          composite operations on small types
            - Think about long-term "how we handle u8 arrays", since each memory access should really give us 4 elements.
              We'll hit the same memory access with 4 threads if we don't do it more intelligently.
          multidimensional load/store (multidimensional arrays)
            - Load, Load2, Load3, etc.

        Complications:
          atomics
            - CANNOT run normal load/store on parts that contain atomics
            - ONLY can run atomic operations directly
            - without extensions, we can't ptr(storage/workgroup) to them in functions, and our normal variations of
              load/store don't work.
          bit-packed writes
            NOTE: writes to bit-packed data are tricky, SINCE THEY COULD CONFLICT UNLESS WE DO ATOMICS
          range checks / length
            TODO: how to handle range checks? where do we store length?
          BufferLogger
            BufferLogger with zipped things from different buffers?

        * "data layout" is the actual configuration of leaf types for any data source (load/store)
        * "binary-compatible" data layouts are data layouts that have the same binary layout (e.g. same stride, etc.)
          - can have different binary-compatible data layouts for the same buffer data in different shaders
        * actual load/store only happens with leaves (or a struct that essentially sets multiple leaves)
        * every data source can produce WGSL to load/store any individual leaf.
        * "raw" data source is one that specifies the data layout directly (the WGSL type, and either the buffer,
          workgroup, private variable, or local variable that it is stored in).
          - e.g. might have array&lt;u32&gt;
        * "reinterpret" data source wraps another one (e.g. turning array-u32 into array-implicitStruct{x:u32, y:32, ... + some booleans})
          - changes the length of an array - we might have tricky mappings for this.
          - Also provides getter/setter code.
        * "leafify" (reinterpret) data source wraps another one (e.g. turning array-implicitStruct{x: u32, y:32, some booleans}
          into array-struct{xy: vec2, some booleans}) - now it's a leaf and we can access and store it.
          - If we have a binary/compare op on the "before", we should ideally have a binary op on it now.
          - Ability to leafify things to either (a) unpacked or (b) bit-packed memory-efficient forms.
        * "subset" - either one or multiple fields of a type (generally within an array?)
        * "map" (can be unidirectional or bidirectional) gives a different type (either for an array or a single bit?)
        * "zip" combines 2+ data sources into one data source. Can be used to create tuples.
        * "bit-pack" can turn a data source into a array-u32??? --- this we'd want to be the "raw" right?
          - DO WE just have the "reinterpret" as the bit-pack?
        * "striped permutation" - based on a workgroup/grain size, adjusts the indices, or:
        * "permutation" - some arbitrary type of permutation
        * "offset" should be trivial

        Examples of concepts applied:
          structure-of-arrays is simply data sources zipped together - works for all cases
          sort-by-subset accomplished by zipping an index with the mapped subset, then handling the last sort bit

        Separation of "data source" from load/store (for things like map)
          - data source just describes the data layout
          - can grab load/store from data source, THEN apply maps/etc.
        NOTE: For this, if it is a bidirectional (homomorphic) map, it can still act like a data source, no?

        TODO: see if performance is worse when we use a function to do the mapping

        Every access/store is for a WGSL type. They are either atomic or constructible.
        We maintain metadata to be able to "access" any parts of this type (if constructible).

        When subsetting things with bit-packed fields, we might be able to "transfer" multiple fields at once, e.g.
        a mask and bit shift could transfer multiple booleans at once.


        Examples:
          1:
            data:
              buffer A, raw tightly-packed array(u32)
              buffer B, raw tightly-packed array(f32)
            data sources:
              C:  raw A array(u32)
              C*: interpret C as array( u8 )
              D:  raw B array(f32)
              D*: interpret D as array( implicit{ x: f32, y: f32, z: f32 } )
              E:  zip C*,D* as array( implicit{ index: u8, x: f32, y: f32, z: f32 } )
              F:  subset E to array( implicit{ index: u8, x: f32 } )
              G:  leafify to array( struct{ index: u32 (rep u8), x: f32 } ) <--- hmm, do we do this so we can minimize duplicate loads?
              H:  striped raked coalesced access of workgroupSize,grainSize to array( struct{ index: u32 (rep u8), x: f32 }, grainSize )
              I:  H load at index, do a sort on x, then apply rearrange to E?
          TODO: example where we do some scan/reduce ON BIT-PACKED DATA(!)
            NOTE: writes to bit-packed data are tricky, SINCE THEY COULD CONFLICT UNLESS WE DO ATOMICS

        Each data source has a "logical" representation (e.g. a specific concrete/implicit type) independent of how it is stored.
        Each data source also can support requesting load/stores of specific "leaf" explicit representations, usually
        of parts/slices, but also the whole. For instance:
          "get the 'a' and 'b' fields from the 3rd striped element of the array, for specific workgroupgrain".
        -- NOTE: we probably first "transform" the data source by saying "give us the logical slice of an array with
          elements of just 'a' and 'b'". THEN transform it with getting things from a striped format with workgroup/grain.
          THEN just ask for the element (with either a constant or varying expression), possibly the preamble too.
          At each stage, should be able to note a good "access order"
          !!!!!
        -- NOTE: If we need to read data, we might have to "read" more bits than "necessary". If we're doing a store, we
          might need to read certain "unmodified" bits to write out the leaf type.
        -- Have a good way of encoding information into ByteEncoders (which can set the bytes/bits they care about)
        -- Ability to create "rearranged" leaf struct types (so that they take up less space).
          -- !! HOWEVER we probably don't need the binary equivalent forms WITH leaf struct types. Good eventually.
        -- !!! Wait for optimization of index expressions until later. We'll tackle it, just need to know WHAT we need
               to tackle first.
        -- Figure out how to load ONLY the stored leaf types needed to extract the substructures(!). And how to create
           substructures based on what an operation does (e.g. if we only need a certain operation that neglects
           certain fields, can we auto-ignore it?)

        -- Should we have two --- representations? Both a "buffer" representation, and a "local" representation?
        -- Probably not, we presumably will want to sometimes bit-pack "local" representations (for workgroup memory).
        -- We implicitly have a "bit-packed" representation, and then "WGSL leaf" representation?

        -- We'll want to sometimes extract multiple fields at the same time. So that we don't do "over-reads", perhaps
           create a custom subset as an explicit leaf type that can be read.

        -- how to create mappings/combination of types (and bit-pack them for storage, but have in-memory non-bit-packed representations)
        -- how to create the buffer slots / sources that are derivative for things

        implictStructLoad.someImplicitField.leafField.getAccessor( 'striped', workgroupSize, grainSize, { premable: true } )

        -- If we wrap an implicit struct into an explicit (leaf) struct, we need the code to get each field out to be generated.

        -- NEED variable-length arrays (but also support fixed-size arrays - we just don't need to specify a size for
        certain full operations).

        Load should only put "read" binding on a buffer slot. If we also have a store, then they will get combined.

        OR:

        --------------

        load.getLoaderForFields( ... ) // no, just use whatever API we can do create sub-structures (or also zip structs)

        --------------

        load.getBlocked( workgroupSize, grainSize )

        const accessor = load.getAccessor( 'striped', workgroupSize, grainSize, { premable: true } )
        ${accessor.preamble}
        ${accessor.load( index )}

        const accessor = load.getAccessor( 'global', { premable: false } )
        ${accessor.load( index )}

        --------------

        store.setBlocked( workgroupSize, grainSize, valueWGSL )

        const accessor = store.getAccessor( 'striped', workgroupSize, grainSize, { premable: true } )
        ${accessor.preamble}
        ${accessor.store( index, value )}

        --------------

        -- the ability to QUERY what order the data is in, so we can pick a good order to read with.
        For instance, is striped or blocked loading coalesced? If data is blocked, striped is coalesced.
        If data is striped, blocked is coalesced.

        Maybe a load can have different access patterns you can "GRAB" from it, as an object. Decide on premable or not.
        Then call the access pattern when needed.

        Can get dynamic offsets by reading from uniforms.

        -- we need to get better uniforms setup.

        IMPORTANT:
          atomics are NOT constructible, so we can't have them as a let, or a return type, or included in anything like that
          we CAN have a pointer to non-constructible things
          how can we handle pointers - also unrestricted_pointer_parameters?
            without this, user-defined function parameters can only have pointers of address-space function/private
            also unrestricts "each argument of pointer type to a user-defined function must have the same memory view as its root identifier
          pointers are... tricky, see https://gpuweb.github.io/gpuweb/wgsl/#root-identifier and alias analysis.
            we can't alias them in function calls with a read and write.
          .... can we get around these things by avoiding function calls, and inlining things?
        WE AVOID THIS BY NOT USING THE DATA SOURCE SYSTEM WITH ATOMICS. Atomics should use more of a raw system.

      </p>

      <p class="TODO">
        Ensure we're setting atomics values before we start accumulating into them.
      </p>

      <p class="TODO">
        A good example of fusing/etc. would be sorting one list of items by the results of another list or order? Or
        by extracting out part of them?
      </p>

      <p class="TODO">
        Do an initial fusing pass, but then move the raster clipper over into our new framework.
      </p>

      <p class="TODO">
        Get an automated tester for bind group strategies, so that we can try ALL of the possible strategies, and
        identify (based on the platform) what the fastest approaches are.
      </p>

      <p class="TODO">
        Fusing<br><br>

        Only some access patterns are compatible (i.e. will not need a shader barrier).<br><br>

        Do somewhat manual fusing - specify load/store for shared things to a local variable or workgroup memory,
        then combine the main()s. We will want to factor out our "mains" and their needs separately.

        How to reuse workgroup storage? (potential for u32 read-writing).

        More advanced fusing might be possible, for the future.
      </p>

      <p class="TODO">
        Buffer offsets for uniforms, for radix passes? (Can we use dynamic offsets in that case?) - We will want to
        minimize number of shaders eventually, for Windows.
      </p>

      <p class="TODO">
        Add Thrust-like primitives:

        (a) "fill"/"set" (to fill a certain buffer/slot with a specific value). Needed for non-zero (or multiple use) atomics.
        (b) "sequence"
        (c) "replace
        (d) "transform"
        (e) "filter" / "partition"
        (f) "count", and other map-reduce patterns.
        (g) "unique"
        (h) "sort-by-key" - and counting iterator to pair it.
        (i) ability to use iterators and zip (to create tuples with their composite ConcreteTypes)
        (j) iteration through LINKED LISTS (e.g. fast reduce/scan over them)

        See <a href="https://docs.nvidia.com/cuda/thrust/index.html">Thrust docs</a>.
      </p>

      <p class="TODO">
        Performance test radix sorts (micro and macro).
      </p>

      <p class="TODO">
        Get a way to visualize the memory loading of a pipeline (order, is it coalesced?)
      </p>

      <p class="TODO">
        Top level doc todo:

        We are creating a framework around WebGPU's compute shader APIs so that we can easily vary the bind group and buffer
        sharing (for profiling or optimization), while providing a convenient interface for writing shaders.

        TODO: describe the API we're working with

        TODO: flesh out this description and current work
      </p>

      <p>
        Current weakness with a pipeline that would be given different inputs (e.g. ping-ponging in radix sort).
        Current slot approach will require multiple pipelines (... more of a compilation step?) that will be
        long-term undesirable. However it sounds tricky to actually have different inputs on a pipeline (as that might
        actually constrain our bind group layout setup), since we'd need to pick ONE bind group layout for the pipeline.

        To repair... what if we had "switchable" buffer slots?

        This would result in multiple bind groups PER layout.
      </p>

      <p class="TODO">
        lengthExpression taking context. Factor out length getter. Think about abstractions for "expression" vs
        "assignment statement". For "statements", do we just have the user create a function? (With certain types,
        that might not be feasible?)
      </p>

      <p class="TODO">
        Minimal example of "shared" bind groups. Or also... maybe toss the log into a bind group that isn't modified.

        For this... get things like the radix sort example working (it's more... interesting).

        Once we have radix sort, test bind group approaches, buffer sharing/aliasing, buffer offsets (one big buffer?)
      </p>

      <p class="TODO">
        We'll really need to port over the double/triple reduce styles, since it handles non-commutative correctly.
      </p>

      <p class="TODO">
        Typing improvements for linking!
      </p>

      <p class="TODO">
        Buffer labels!
      </p>

      <p class="TODO">
        Break out data sources. Explicitly model data sources (input/output).
      </p>

      <p class="TODO">
        Set up example of stored "slots" potentially on routines as fields, so they can be accessed. Figure out bind group
        optimization with this.
      </p>

      <p class="TODO">
        Test performance comparison between "changing bind groups" and "changing pipelines" (could use one bind group with
        multiple pipelines, or multiple bind groups with one pipeline in some cases).
      </p>

      <p class="TODO">
        ConcreteBufferSlot aliasing - we'll want these to be classed as BufferSlotSlices (so things will pass through).
      </p>

      <p class="TODO">
        Get it working with texture storage (and combine with blitshader improvements) for full chain.
      </p>

      <p class="TODO">
        Get writeup for AI analysis.
      </p>

      <p class="TODO">
        Working Mermaid diagram for the linking process.
      </p>

      <p>
        Current approach uses slots (that are passed through or cast as needed). Sizes/offsets defined at blueprints.
        Blueprints can have hints for bind groups - but we expose fields recursively for full buffer control.
        We support partial binding (with Procedure), and "sharing" of buffers.
      </p>

      <p class="TODO">
        Performance test: bind groups, buffer sharing/aliasing, buffer offsets (one big buffer?).
      </p>

      <p class="TODO">
        NOTE dynamic offsets need to be a multiple of 256:
        https://www.w3.org/TR/webgpu/#dom-supported-limits-minstoragebufferoffsetalignment
      </p>

      <p class="TODO">
        toString() all of our objects when prototyping, so we can better understand the structures we are playing with.
      </p>

      <p class="TODO">
        In radix example, we would have two different reduction pipelines (one for each direction of the ping-pong).
        But... how can we get them to "share" their internals?
      </p>

      <p class="TODO">
        Use dynamic buffer offsets for easy uniform use (for radix passes). The "blueprint" will specify the dynamic
        nature, the bind group setup will provide the dynamic offset (GPUBufferBinding offset,size). Provide minBindingSize
        in the blueprint setup.
      </p>

      <p class="TODO">
        How can we support Thrust-like fusing of shader stages transparently/effortlessly? Do we not need this (say,
        everything is factored out where we don't have to create things as much)?
      </p>

      <p class="TODO">
        It's a pain for fusing right now, with workgroup memory, bindings, etc. We'd want to deduplicate bindings when
        fusing.
      </p>

      <p class="TODO">
        Consider splitting uniforms into "ones that change during calls" and "ones that do not". Performance cost
        potentially.
      </p>

      <p class="TODO">
        Use (conditionally, based on a boolean) <code>GPUCommandEncoder</code>'s <code>pushDebugGroup</code>,
        <code>popDebugGroup</code> and <code>insertDebugMarker</code>. Hopefully this will show up in things like
        RenderDoc/PIX.
      </p>

      <p class="TODO">
        Get an example with "render to Canvas texture" working with the new approach.
      </p>

      <p class="TODO">
        Rename "primary" compute pass into better naming (for when we are profiling but not separating). Test separated
        and profiled.
      </p>

      <p class="TODO">
        Factor out all of the associated code to render things to a Canvas texture (e.g. blitshader).
      </p>

      <p class="TODO">
        Use staging buffer ring like in
        <a href="https://toji.dev/webgpu-best-practices/buffer-uploads.html#:~:text=Buffer%20data%20flow,-Before%20we%20get%20into%20the">WebGPU Buffer Uploads</a>.
        Actually, find the other (WGPU) example of a buffer "belt" and implement that!
        See also <a href="https://github.com/gpuweb/gpuweb/blob/main/design/BufferOperations.md#updating-data-to-an-existing-buffer-like-webgls-buffersubdata">https://github.com/gpuweb/gpuweb/blob/main/design/BufferOperations.md#updating-data-to-an-existing-buffer-like-webgls-buffersubdata</a>
        and <a href="https://github.com/gpuweb/gpuweb/discussions/1428">https://github.com/gpuweb/gpuweb/discussions/1428</a> ("Buffer ring “belt” in wgpu. It sub-allocates chunks of buffers to ship?").
      </p>

      <p class="TODO">
        Use unique names in each template, so we can trace code back better(!)
      </p>

      <p class="DEFERRED">
        Dynamic buffer sizing (For the same blueprint / compiled version)
      </p>

      <p class="DEFERRED">
        Dynamic buffer offsets (for the same blueprint / compiled)
      </p>

      <p class="DEFERRED">
        Multiple flows (e.g. 1/2/3 level scan/reduce in the same pipeline) - tricky for bind group questions.
        How can we have different "flows" with the same blueprint? (e.g. differing levels of reduce). How can we
        get bind group layouts to work nicely for all of these approaches? Write down an example (e.g. scans in radix sort).
        Flows: we can actually just construct multiple different "routines" (e.g. reduce), and pick the one based on
        the constraint. How can we provide notes about the possible configurations to the bind group layout strategy?
        Really need to nail down bind group layout examples. What if flows are branching (e.g. any node can have multiple
        pathways that can share things).
        IMPORTANT: We can potentially have different bind groups provided for different flow cases. Even if they are
        using the same buffers. This would allow full optimization(!) for each different flow used. At runtime, we
        would need to know which flow is picked. Think about how we would set up bind groups for varying flows.
        Most importantly, will it affect bind groups
        or layouts OUTSIDE of the branch?
      </p>

      <p class="DEFERRED">
        Automated buffer sharing based on the "usage scope" of the subroutines - share "unrelated" buffers if scopes do
        not overlap - NOTE what do we need for zero-init atomic reduce?
      </p>

      <p class="DEFERRED">
        Automated combined uniforms. How can we "compose" uniform types, into one "read-only" uniform that could be left the entire pass? Provide
        a way to get things out of it that can be included in shader templates(!). Pipelines might need to register
        uniforms (with a type).
      </p>

      <p class="DEFERRED">
        "per compilation" memoizing of routines (for shared buffers, etc.)
      </p>

      <p class="DEFERRED">
        Pack multiple "slots" into one buffer, for buffer quantity limits (so we'll index into a big u32 buffer, and
        cast/extract contents).
      </p>

      <p class="DEFERRED">
        "per compilation" memoizing of routines (for shared buffers, etc.)
      </p>

      <p class="DEFERRED">
        Optimize buffer usage flags.
      </p>

      <p class="DEFERRED">
        Get it to work with vertex/fragment shaders too (e.g. BlitShader)
      </p>

      <h4 id="WGSL-recording">Recording</h4>

      <p class="TODO">
        Record performance tests (perhaps in PerformanceTesting), to create the setup/run/dispose object (e.g.
        () => { run, dispose } as closures).
      </p>

      <p class="TODO">
        Have the ability to generate an easily-profiled "full-path animation frame" form, so we can iterate quickly
        on manual tweaks to performance (e.g. manually tweak WGSL or WebGPU calls, to see how it performs).
      </p>

      <p class="TODO">
        Add in render (e.g. BlitShader) commands.
      </p>

      <p class="TODO">
        Add in wrapping of Canvas calls (especially if discovered, record the resolution). Wrap the configure and texture getter.
      </p>

      <p class="TODO">
        Record properties of the device, so we can see if things are compatible (preferred canvas format, supported
        storage texture formats, etc.)
      </p>

      <p class="TODO">
        Record onSubmittedWorkDone()
      </p>

      <p class="TODO">
        Use this to record any platform-specific behavior we have(!) - Or reproduction of bugs
      </p>

      <p class="TODO">
        Set up a library for third parties that stubs out WebGPU APIs, and records like this. Can enable a website for
        comparison of performance in approaches.
      </p>

      <p class="TODO">
        How to handle mapAsync/onSubmittedWorkDone nicely? We could record the order they are resolved in, and then
        replay them in that order.
      </p>

      <p class="TODO">
        ArrayBuffers: Do we store persistent and non-changing ones as "one object"? We could just recreate an ArrayBuffer
        for every data transfer. We could also see the length and contents, and store it as a "constant" on the new end.
      </p>

      <p class="TODO">
        Consider "replay" or "replay with replacement of objects in the closure" - Could potentially use this for production
        performance?
      </p>

      <p class="TODO">
        Be able to record the raster clipper
      </p>

      <h4 id="WGSL-logging">Logging</h4>

      <p class="TODO">
        Add log levels or features
      </p>

      <p class="TODO">
        Handle different output formats (rather than the raw, existing, etc.). Per-thread formats? My notes show
        "flat", "workgroup", "indexed" (show all threads), "no-data", "ranged" (show only a range of threads).
      </p>

      <p class="TODO">
        Look more at per-thread slices if helpful
      </p>

      <p class="TODO">
        Think about ways to add assertions into this? (Runtime assertions would be helpful).
      </p>

      <p class="TODO">
        How to handle only a workgroup-level log? (would we just do that for local_id === 0?) - Currently better than our
        "log every thread that gets it" approach, BUT it _IS_ collapsed.
      </p>

      <h4 id="WGSL-testing">Testing</h4>

      <p class="TODO">
        Write values into buffers PAST the length, to verify length checks.
      </p>

      <p class="TODO">
        Write bogus values into output buffers (so if we do e.g. atomics, we set the value correctly first).
      </p>

      <h4 id="WGSL-algorithms">Algorithms</h4>

      <p class="TODO">
        Add assertions for power-of-2 workgroupSize when needed (e.g. most places Math.log2 are used).
      </p>

      <p class="TODO">
        Double-buffering works around write-after-read race conditions, we should use that more.
      </p>

      <p>
        Many algorithmic details from <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span>.
        Inspiration on patterns partially from <span class="citation" data-citation-id="BELL2012359"></span>.
      </p>

      <p class="TODO">
        Factor out "main_" type code so that it is reusable in other places (that might use different types of buffer
        reads, etc.)
      </p>

      <p class="TODO">
        Write main operations so they can operate at arbitrary buffer offsets - could included all reduces into a single buffer.
      </p>

      <p class="TODO">
        Read papers and books! - Notably Levien one with the scan patterns.
      </p>

      <p class="TODO">
        Review <a href="https://github.com/moderngpu/moderngpu">modernGPU</a> for other algorithms.
      </p>

      <p class="TODO">
        Investigate compute ordering of workgroups across devices, see https://compute.toys/view/207
        and https://compute.toys/view/210
      </p>

      <p class="TODO">
        Allow subtract() in addition to combine() in templates, there are some cases where it might be more efficient to
        compute (if that's allowed).
      </p>

      <p class="TODO">
        Implement our "matching reduce" (used for our chunk reduction) as a general thing.
      </p>

      <p class="DEFERRED">
        Create parallel TS version of templates.
      </p>

      <p class="DEFERRED">
        Convolution? "Halo" load in convolution, see Programming Massively Parallel Processors book.
      </p>

      <p class="TODO">
        Check into <a href="https://github.com/greggman/webgpu-utils">https://github.com/greggman/webgpu-utils</a>,
        could it work for WGSL struct offsets?
        <a href="https://webgpufundamentals.org/webgpu/lessons/resources/wgsl-offset-computer.html#">Offset computer</a>
      </p>

      <p>
        Some good examples of primitives: <a href="https://github.com/fynv/webgpu_math/tree/master/client">https://github.com/fynv/webgpu_math/tree/master/client</a>
      </p>

      <p>
        Some demos: <a href="https://tellusim.com/webgpu/">https://tellusim.com/webgpu/</a>.
      </p>

      <p class="TODO">
        Install and play with CUDA so I have more general knowledge/familiarity.
        <a href="https://journal.hexmos.com/gpu-survival-toolkit/">intro notes</a>.
        <a href="https://developer.nvidia.com/cuda-downloads">CUDA downloads</a>
      </p>

      <p class="TODO">
        <a href="https://docs.nvidia.com/cuda/thrust/index.html#algorithms">Review Thrust algorithms</a>, and
        <a href="https://github.com/STEllAR-GROUP/hpx">HPX</a>.
      </p>

      <p class="TODO">
        Review this <a href="https://safari.ethz.ch/projects_and_seminars/fall2022/doku.php?id=heterogeneous_systems">Heterogeneou systems class</a>.
      </p>

      <p class="TODO">
        Create explicit stream compaction. Or also "prefix max" with index of first occurrance.
      </p>

      <h4 id="WGSL-reduce">Reduce</h4>

      <p>
        Referenced <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span> and <span class="citation" data-citation-id="KieberEmmonsReduce"></span> in the implementations.
      </p>

      <p>
        Commutative and non-commutative monoids have to be handled differently. For commutative cases, we can load it
        in a striped memory-access pattern regardless of the original ordering (although we need to know the original
        ordering to do range-based checks). For non-commutative cases, a raked reduce doesn't work well unless we are
        loading in the normal (blocked) order.
      </p>

      <p>
        Both can be run as a "convergent" reduce, but for non-commutative cases we will need to transform it within shared
        memory for this to work.
      </p>

      <p>
        For any atomic-combinable type, we should always just do one level of reduction, which stores the result to a
        single atomic. For other types, we will need to run a multi-level reduce if the input size is potentially large
        enough to require it.
      </p>

      <p class="TODO">
        A "sequential-raked" reduce for blocked non-commutative cases. We either do this in
      </p>
      <ol class="TODO">
        <li>
          Zero-memory-overhead case, we keep the reduction in scratch[0], read in a workgroup-chunk into shared memory,
          fully reduce it (and during the read-in we combine new[0] with our reduced scratch[0]). This is a bit simpler.
        </li>
        <li>
          Fewer operations/workgroupBarriers: we keep a workgroup-var of whatever size (in addition to scratch) that will
          hold partial results. We read in a workgroup-chunk into shared memory, and PARTIALLY reduce it, storing the
          (multiple) partially reduced values into the workgroup-var of partials. We do this until the end (filling our
          workgroup-var, choosing our partial reduction depth to do this), and then run a normal full reduce on that.
        </li>
      </ol>

      <p class="TODO">
        Get index_mapped_loop working nicely (it is created, but not really used yet). load_multiple and load_reduced.
        We should do this for the newer TypeScript-based templates. Do it on a high-performance day.
      </p>

      <p class="TODO">
        Condense examples. Create some reusable "examples" that we can use for BOTH unit tests AND profiling.
      </p>

      <p class="TODO">
        Performance Tests:<br>
        Non-commutative non-raked striped-load, remap to blocked in shared memory (and then... convergent), and do convergent reduce?
      </p>

      <p>
        ModernGPU notes on grain size: By choosing an odd number for VT we avoid bank conflicts that would otherwise be incurred when re-ordering data between strided and thread orders. Within a warp, all banks (VT * tid + i) % 32 are accessed exactly once for each step i when VT is odd. If VT is a power-of-two, you can expect VT-way conflicts at each step.
      </p>

      <p>
        <a href="https://simonbyrne.github.io/notes/fastmath/">Beware fast-math</a>
      </p>

      <h4 id="WGSL-scan">Scan</h4>

      <p>
        Referenced <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span> and <span class="citation" data-citation-id="KieberEmmonsPrefixSum"></span> in the implementations.
      </p>

      <p>
        We are using Hillis-Steele approaches <span class="citation" data-citation-id="10.1145/7902.7903"></span>, but with
        raking <span class="citation" data-citation-id="223009"></span> approaches from
        <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span>, with notes from <span class="citation" data-citation-id="LevienPrefixSumVulkan"></span>.
      </p>

      <p class="TODO">
        Can we have an example scanning (or reducing) packed u16 data?
      </p>

      <p class="TODO">
        Remember for 3-level atomic-able, we can single-reduce and distribute atomics. For others, we will need
        multi-step reduce (if we are doing reduce-scan+add strategies). If we do the atomic-able, we can start with the
        scan on the bottom level.
      </p>

      <p class="TODO">
        After we get blocked non-commutative better reduce, do 3-level scans next. Get an optimized u32 scan (consider u16 packed scan?)
      </p>

      <p class="TODO">
        Get "right" direction scans working for everything.
      </p>

      <p class="TODO">
        Use the terminology "scan block" pervasively.
      </p>

      <p>
        Implemented the 2-level "reduce-scan-scan" strategy (<code>DoubleReduceScanShader</code>), and the 3-level
        "reduce-scan-scan-scan" strategy (<code>TripleReduceScanShader</code>).
      </p>

      <p class="TODO">
        Strategies:<br>
        2-level scan-scan-add: scan the scan blocks (save entire thing + 1 value per workgroup), scan that buffer/data (save), scan/add the scan blocks (save)<br>
        2-level reduce-scan+add (vello-like): reduce the scan blocks (save 1), scan that IN EACH, use value for scan with scan blocks<br>
        3-level reduce-reduce-scan-scan-scan: reduce the scan blocks (save 1), reduce middle (save 1), scan the base (save), scan/add the middle (save), scan/add the scan blocks (save)<br>
        3-level (atomic): reduce-scan-scan-scan: reduce the scan blocks (save 1 middle, atomic-save to base), scan the base (save), scan/add the middle (save), scan/add the scan blocks (save)<br>
      </p>

      <p class="TODO">
        Performance Tests:<br>
        Non-commutative non-raked striped-load, remap to blocked in shared memory (and then... convergent), and do convergent scan?
      </p>

      <p class="TODO">
        Create a scan using Brent-Kung (see Chap11 of Programming Massively Parallel Processors, code p250), for cases where our
        comparison might be doing memory loads (or other expensive work), we can get better work efficiency.
      </p>

      <h4 id="WGSL-radixSort">Radix Sort</h4>

      <p class="TODO">
        For sorting in general, read through <a href="https://xi.zulipchat.com/#narrow/stream/197075-gpu/topic/Sorting.20revisited">https://xi.zulipchat.com/#narrow/stream/197075-gpu/topic/Sorting.20revisited</a>,
        and note the examples <a href="https://webgpu.github.io/webgpu-samples/samples/bitonicSort">https://webgpu.github.io/webgpu-samples/samples/bitonicSort</a>
        and <a href="https://poniesandlight.co.uk/reflect/bitonic_merge_sort/">https://poniesandlight.co.uk/reflect/bitonic_merge_sort/</a>.
      </p>

      <p class="TODO">
        Look into a <a href="https://arxiv.org/ftp/arxiv/papers/2206/2206.01784.pdf">potentially faster radix sort implementation</a>
        with less memory access!
      </p>

      <p class="TODO">
        Look into <a href="https://arxiv.org/pdf/1611.01137.pdf">A memory-bandwidth efficient radix sort</a>.
      </p>

      <p>
        Referenced <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span> and <span class="citation" data-citation-id="KieberEmmonsRadixSort"></span> in the implementations.
      </p>

      <p class="TODO">
        Scan code for TODOs
      </p>

      <p class="TODO">
        Extract the SCAN code out of the RADIX wrapper.
      </p>

      <p class="TODO">
        Separate out workgroup/grain for the scan and the radix operation(!), the radix op takes a lot more memory, and
        might be more limited.
      </p>

      <p class="TODO">
        3-level scan with the multi-level reduce based on atomics might fit nicely here.
      </p>

      <p class="TODO">
        getBits/getBin should handle statements.
      </p>

      <p class="TODO">
        Handle "lightweight" sorting for cases where there is a ton of associated data, but we only
        need a fraction of that to get the keys for sorting (or for comparisons).
      </p>

      <p class="TODO">
        Can we pack the radix sort histogram into u32 atomics (multiples)? Essentially apply our n-bit packing to the
        histogram (since we have a decent number of bits of potential counts).
      </p>

      <p>
        See <a href="https://github.com/fynv/webgpu_math/blob/master/client/radix_sort.js">example</a>, or
        <a href="https://www.reddit.com/r/GraphicsProgramming/comments/14c7gok/parallel_radix_sort_confusions_options/">reddit note</a>.
      </p>

      <h4 id="WGSL-mergeSort">Merge Sort</h4>

      <p>
        Raked merge code from <span class="citation" data-citation-id="bb5afff9714941c681fc0e5270168a71"></span>.
      </p>

      <p class="TODO">
        Implement this, and figure out a way to sort a workgroup's raked worth! We could radix-sort it if we can get bits.
        Otherwise, what's a good parallel efficient sort? We could actually just merge-sort if it comes to it?
        Sorting networks/circuits?
        <a href="https://en.wikipedia.org/wiki/Bitonic_sorter">https://en.wikipedia.org/wiki/Bitonic_sorter</a>
        is recommended?!?
      </p>

      <p>
        Might be possible, see <a href="https://moderngpu.github.io/mergesort.html">ModernGPU merge sort</a>.
      </p>

      <h4 id="WGSL-stackMonoid">Stack Monoid</h4>

      <p class="TODO">
        Implement <span class="citation" data-citation-id="levien2022fast"></span>
      </p>

      <h4 id="WGSL-memory">Memory</h4>

      <p>
        See thoughts for <a href="https://github.com/linebender/vello/issues/366">Vello</a>.
      </p>

      <p class="TODO">
        Bump allocation with atomics
      </p>

      <p class="TODO">
        Recird when bump allocation runs out of memory (use <code>arrayLength</code> on buffers).
      </p>

      <h4 id="WGSL-rationals">Rationals</h4>

      <p class="TODO">
        WE NEED to keep around our initial coordinates, or store something like that initially. Because otherwise,
        comparing Q128 cross-multiply messes with our bit depth. We would need u64u64multto128, subtracti128, etc.
        NOTE: If it is only an issue with sorting things with the same end, perhaps we could sort with floats and would
        have enough precision?
      </p>

      <p class="TODO">
        Use Karatsuba multiplication to get rid of another multiply.
      </p>

      <p class="TODO">
        HOW CRAZY: If we allow representations of square roots, could we handle cubic bezier intersections precisely?
        Get exact bounds? probably not everything - augment with floating point (especially for atan2) for fast angle
        comparison? OR, what is the best way to compare angles? We can still compare slope, right?
      </p>

      <p>
        Reference for bithacks: <a href="https://graphics.stanford.edu/~seander/bithacks.html">https://graphics.stanford.edu/~seander/bithacks.html</a>.
      </p>

      <h4 id="WGSL-profiling">Profiling</h4>

      <p class="TODO">
        Profile all of the things! Especially those small settings that we might want to get rid of.
      </p>

      <p>
        Remember to put a "copy" like shader in front of something we're profiling, if we want to fully test the entire
        "memory overhead" of the main operation. Or find some other way of ejecting the content from the cache.
      </p>

      <p class="TODO">
        Omit mappable reads from performance tests
      </p>

      <p class="TODO">
        Test and see if having one big buffer that we use for everything is more performant.
      </p>

      <p>
        Using PIX on Windows. See <a href="https://gist.github.com/Popov72/41f71cbf8d55f2cb8cae93f439eee347">https://gist.github.com/Popov72/41f71cbf8d55f2cb8cae93f439eee347</a>
      </p>

      <p>
        See GPUProfiling and strategies. See also <a href="https://github.com/mighdoll/alpenbench">alpenbench</a>.
      </p>

      <p>
        Note that we really want to separately examine a "single" vs "parallel" execution in benchmarks. Sometimes we care
        if we're running just a single workgroup (how can we make that workgroup as fast as possible on its own - occupancy
        doesn't matter as much). Sometimes we care about a huge number of workgroups (occupancy matters a lot more, and
        can be the cliff-limiting factor).
      </p>

      <p>
        Try <a href="https://renderdoc.org/">RenderDoc</a>.
      </p>

      <p class="TODO">
        PIX didn't show occupancy information on my OS/GPU. Find a way where we can inspect occupancy directly.
      </p>

      <p>
        Use median for profiling.
      </p>

      <p>
        <a href="https://github.com/mighdoll/thimbleberry/blob/main/doc/Utilities.md#gpu-performance-reports">Thimbleberry usage notes</a>
      </p>

      <p class="TODO">
        Look for cases where we should turn array-of-structures into structure-of-arrays (e.g. striding) for better
        memory access patterns.
      </p>

      <p>
        Note: <code>--enable-dawn-features=allow_unsafe_apis --enable-webgpu-developer-features</code> for when we are profiling. Or
        <code>open -a "Google Chrome Canary" --args --enable-dawn-features=allow_unsafe_apis --enable-webgpu-developer-features</code> on macOS.
      </p>

      <p>
        Can we profile on Metal (on macBook), see <a href="https://developer.apple.com/documentation/xcode/optimizing-gpu-performance">https://developer.apple.com/documentation/xcode/optimizing-gpu-performance</a>.
      </p>

      <p class="TODO">
        Optimize for MAD (multiply-add, one cycle).
      </p>

      <p>
        <a href="https://developer.nvidia.com/gpugems/gpugems2/part-iv-general-purpose-computation-gpus-primer/chapter-35-gpu-program-optimization">Performance tips in general</a>,
        <a href="https://developer.nvidia.com/blog/how-access-global-memory-efficiently-cuda-c-kernels/">Global memory efficiency</a>,
        <a href="https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/">Grid stride loops</a>,
        <a href="https://developer.nvidia.com/blog/the-peak-performance-analysis-method-for-optimizing-any-gpu-workload/">Performance analysis</a>,
        <a href="https://gpuopen.com/learn/optimizing-gpu-occupancy-resource-usage-large-thread-groups/">Optimizing occupancy</a>,
        <a href="https://www.kodeco.com/books/metal-by-tutorials/v3.0/chapters/32-best-practices">32 best practices</a>,
        <a href="https://developer.nvidia.com/gpugems/gpugems2/part-iv-general-purpose-computation-gpus-primer/chapter-35-gpu-program-optimization">GPU optimization</a>,
        <a href="https://developer.nvidia.com/gpugems/gpugems">GPU Gems 1</a>,
        <a href="https://developer.nvidia.com/gpugems/gpugems2">GPU Gems 2</a>,
        <a href="https://developer.nvidia.com/gpugems/gpugems3">GPU Gems 3</a>
      </p>

      <p class="TODO">
        WebGL2 if possible, see <a href="https://webgl2fundamentals.org/webgl/lessons/webgl-gpgpu.html">Compute guide</a>
      </p>

      <p class="TODO">
        Test out a bunch of performance hypotheses.
      </p>

      <h4 id="WGSL-unitTesting">Unit Testing</h4>

      <p class="TODO">
        Minimize duplication in unit tests (and get more of our "shaders" up to speed, like the radix ones).
      </p>

      <h4 id="WGSL-knownBugs">Known Bugs</h4>

      <p class="TODO">
        Overflows on left-shifts are a problem! <code>&lt;&lt; 32</code> was causing all sorts of issues in the radix
        sort. Investigate full overflow handling for u32/i32. Make sure we are not overflowing (or at least shifting by 32).
      </p>

      <p class="TODO">
        Check the radix sort, and make sure it is not requesting more bitVector size than it is using.
      </p>

      <h4 id="WGSL-notes">Notes</h4>

      <p>
        Can request more storage buffers:
        <a href="https://bugs.chromium.org/p/dawn/issues/detail?id=2159">https://bugs.chromium.org/p/dawn/issues/detail?id=2159</a>
      </p>

      <p>
        Synchronization notes: onSubmittedWorkDone() are settled in order, and mapAsync() settles before the corresponding
        onSubmittedWorkDone().
      </p>

      <p>
        BindGroupLayout "group-equivalent" means "essentially equal bind group layouts are interchangeable".
      </p>

      <p>
        Think about memory bank conflict patterns (with grains, etc.). Interleaved banks/channels for memory, with
        coalesced memory. Remember for coalescing we can potentially do one of the following:
      </p>

      <ul>
        <li>
          Rearrange threads
        </li>
        <li>
          Rearrange data that the threads access
        </li>
        <li>
          Perform uncoalesceable accesses on shared memory, then transform the data from shared to global in a coalesced
          way
        </li>
      </ul>

      <p>
        Batch/aggregate atomic updates (with privatization).
      </p>

      <p>
        Use <code>arrayLength()</code> more.
      </p>

      <p>
        Use <code>num_workgroups</code> more.
      </p>

      <p>
        Dynamic offset alignment restricted to 256 by default (https://www.w3.org/TR/webgpu/#dom-supported-limits-minstoragebufferoffsetalignment).
        Default 8 storage buffers max per pipeline layout, 12 uniform buffers max per pipeline layout. 256MiB for buffer size.
        16384 bytes for workgroup storage
      </p>

      <p>
        Buffers must be unmapped while any in-flight submission or queue operation may be using them.
      </p>

      <p>
        Really reduce registers because having more workgroups concurrently hides latency.
      </p>

      <p>
        Out-of-gamut premultiplied RGBA provided to a Canvas (rendered on the screen) has undefined behavior!
      </p>

      <p>
        <a href="https://developer.mozilla.org/en-US/docs/Web/API/GPUAdapter/requestAdapterInfo">https://developer.mozilla.org/en-US/docs/Web/API/GPUAdapter/requestAdapterInfo</a>
        plus WebGPU Developer Features flag (chrome flags) will show type/backend.
      </p>

      <p>
        f16 types could be helpful, see <a href="https://developer.chrome.com/blog/new-in-webgpu-120">https://developer.chrome.com/blog/new-in-webgpu-120</a>.
      </p>

      <p class="TODO">
        Try dynamic block index assignment (with an atomic), so that we can process data in order. Can we cheat, and
        use that for forward progress and workgroup communication?
      </p>

      <p class="TODO">
        Create a map of what in WGSL are abstract/concrete/structure/composite/constructible/fixed-footprint/plain/atomic/storable types
      </p>

      <h4 id="WGSL-newLanguage">New Language</h4>

      <p>
        A new language should potentially:
      </p>

      <ul>
        <li>
          Include the full chain of operations in a single file (with each section importable).
          Have <code>shaderBarrier()</code> for things that create a split between shaders.
        </li>
        <li>
          Support everything in ConcreteType (bit packed structures, structure-of-array representations, etc.).
          Also should be able to "subset" types automatically, for those parts that are actually used.
        </li>
        <li>
          Potentially include a language (JS/TS?) for templating inside, for when we need it? Maybe not.
        </li>
        <li>
          Include the same syntax/handling for most operations, so we can parse the WGSL part and directly include it.
        </li>
      </ul>

      <h4 id="WGSL-overlap">Overlap</h4>

      <p>
        Overlap is for when we have multiple shader stages that run concurrently (because they don't write to the same
        place, and other constraints).
      </p>

      <p>
        This was extremely painful to get working. Dawn's zero-copy buffers and stages messed up the "usage" flags,
        so we had to use a ton of workarounds. We'll eventually want to get this working in general.
        See <code>PerformanceTesting.ts</code> for the working example (as verified by PIX).
      </p>

      <p>
        For overlap:
      </p>

      <ol>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-1-whats-a-barrier/">What’s a Barrier?</a></li>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-2-synchronizing-gpu-threads/">Synchronizing GPU Threads</a></li>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-3-multiple-command-processors/">https://therealmjp.github.io/posts/breaking-down-barriers-part-3-multiple-command-processors/</a></li>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-4-gpu-preemption/">https://therealmjp.github.io/posts/breaking-down-barriers-part-4-gpu-preemption/</a></li>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-5-back-to-the-real-world/">https://therealmjp.github.io/posts/breaking-down-barriers-part-5-back-to-the-real-world/</a></li>
        <li><a href="https://therealmjp.github.io/posts/breaking-down-barriers-part-6-experimenting-with-overlap-and-preemption/">https://therealmjp.github.io/posts/breaking-down-barriers-part-6-experimenting-with-overlap-and-preemption/</a></li>
      </ol>

      <p class="TODO">
        Test with the overlap case with my performance profiling. See if it reports "parallel" handling.
        <code>PerformanceTesting.loopExpensiveMultiple()</code>.
        This was removed from main, see <a href="https://github.com/phetsims/alpenglow/blob/478aaf7e41fcd1b348aae0b1346fea37a74958ee/js/webgpu/PerformanceTesting.ts#L17">here</a>.
        Then profile it with PIX to see if our profiling matches PIX.
      </p>

      <h4 id="WGSL-documentation">Documentation</h4>

      <p class="TODO">
        Better document all of our existing base templates.
      </p>

      <h4 id="WGSL-deprecated">Deprecated</h4>

      <p class="TODO">
        Entire .wgsl file system. Port over tests for /gpu/, then we can remove a lot of "newer but deprecated" items.
      </p>

      <p>
        DualSnippet (we don't need the "dual" nature, and we're not using snippets for templating).
      </p>

      <p>
        Preprocessing system (all the #ifdef #bindings #import #option stuff). Including the chipper transpilation.
        Move the minification/mangling code elsewhere (might be useful).
      </p>

      <p>
        Execution/BasicExecution/BaseExecution, and the DeviceContext calls. See what we can remove.
      </p>

      <p class="TODO">Note chipper <code>grunt transpile --live</code></p>

      <p>
        For minification, investigate <a href="https://github.com/LucentFlux/wgsl-minifier">https://github.com/LucentFlux/wgsl-minifier</a>
        and <a href="https://github.com/mgnauck/wgslminify">https://github.com/mgnauck/wgslminify</a>
      </p>

      <h3 id="asyncSimulation">Async Simulation</h3>

      <p>
        For use in rapid prototyping and debugging WGSL-like structured code, we've created the ability to run code with
        a similar execution model to WGSL with async/await code in JS/TS.
      </p>

      <p>
        It uses async/await to simulate the execution model by:
      </p>

      <ul>
        <li>
          Running threads/invocations and workgroups in parallel, by awaiting every storage/workgroup read/write (or
          synchronization barrier).
        </li>
        <li>
          Blocking all progress on threads in a workgroup on a synchronization barrier until all threads have reached it.
        </li>
        <li>
          Recording all of the read/writes to storage/workgroup arrays (and which thread/workgroup they were set by),
          with logic on synchronization barriers. This allows detecting potential data races, uniformity issues,
          missing barriers, or other issues.
        </li>
        <li>
          Returning indeterminate values from out-of-bounds or un-initialized array accesses.
        </li>
      </ul>

      <p>
        In the model, typically <code>ParallelStorageArray</code>s are created, and then shaders are dispatched
        by running <code>ParallelExecutor</code>'s <code>dispatch()</code> method (with the dispatch size passed in).
        <code>ParallelExecutor</code>s are created with <code>ParallelKernel</code>, which is roughly equivalent to
        individual shaders. They will specify an async execute function, in addition to a way of creating workgroup-scoped
        values (in a statically-typed way). Workgroup-scoped values can be either atomics, or <code>ParallelWorkgroupArray</code>s.
      </p>

      <p>
        The execute function is given a reference to a context (<code>ParallelContext</code>) that will provide access to
        workgroup-scoped values and synchronization barriers (workgroupBarrier/storageBarrier) directly.
        It's also passed to storage/workgroup data read/writes, so that we can record the accesses and fail out assertions
        if certain data access pattersn are detected.
      </p>

      <p>
        <code>ParallelRaster</code> is an example of TypeScript code validating the model used for the raster-clipping
        process. <code>ParallelRasterSplitScan</code> is an example of a single shader in this model.
      </p>

      <h3 id="depthSort">Depth Sort</h3>

      <p>
        If we store data about what 3D plane each face is on (essentially assigning a depth to each 2d point), it is
        possible to analytically determine the exact divisions between sections where one face is in front of the other.
        For any given pair of planes, there is a 2d line in screen space that delineates the boundary between the two
        areas (the "depth sort line"). We will use this to further split our polygons into disjoint sections where we have
        a total ordering of the planar faces (recall, Alpenglow stages will already have found us a section that is fully
        contained within the faces).
      </p>

      <p>
        This can be used to render 3d scenes, using sorting/splitting of the faces (instead of the normal technique of
        using a depth buffer). We will have a fully vector-friendly description of the 3d scene, down to an arbitrary
        precision!
      </p>

      <div class="row-fluid" style="margin-bottom: 15px;">
        <div class="span4 example">
          <div id="teapot-faces"><div class="example placeholder" style="width: 200px; height: 200px;"></div></div>
          Faces
        </div>
        <div class="span4 example">
          <div id="teapot-normals"><div class="example placeholder" style="width: 200px; height: 200px;"></div></div>
          Interpolated Normals
        </div>
        <div class="span4 example">
          <div id="teapot-phong"><div class="example placeholder" style="width: 200px; height: 200px;"></div></div>
          Phong
        </div>
      </div>

      <p>
        Due to the total ordering of faces, we can actually <strong>trivially</strong> support multiple transparent
        objects correctly!
      </p>

      <div class="example">
        <div id="transparent-3d"><div class="example placeholder" style="width: 128px; height: 128px;"></div></div>
      </div>

      <p class="TODO">
        Something wrong with normals in teapot example, set to 256x256 in demo.
      </p>

      <p class="TODO">
        Analytic perspective correction, see notes on paper --- looks like possible to integrate?
        <a href="https://en.wikipedia.org/wiki/Texture_mapping#Perspective_correctness">https://en.wikipedia.org/wiki/Texture_mapping#Perspective_correctness</a>
      </p>

      <p class="TODO">
        Show arbitrary graphics with 3d perspective:
        Quads in 3D (and putting vector graphics in them!) we will just transform our graphics with perspective too?
        RenderPlanar with 3 points will work for quads just fine.
        Needs normal interpolation for quads.
        We can store the projective transform, to get exact gradients (in fact, we just need to do perspective correction).
      </p>

      <p class="TODO">
        Consider face consolidation after depth sort split! Should save a lot in rasterization?
      </p>

      <p class="TODO">
        Need to do breadth-first for our splitting in DepthSort, otherwise we are doing a lot of unnecessary splitting.
      </p>

      <p class="TODO">
        Checkerboard ground could be texture with analytic nearest image.
      </p>

      <p class="TODO">
        Planar getDepthSplit should check to see if the line goes through the bounds. We can skip expensive clipping a lot
      </p>

      <p class="TODO">
        Better name for RenderPlanar.
      </p>

      <p class="TODO">
        Near-far plane clipping in depth sort?
      </p>

      <p class="TODO">
        Stereographic/orthographic projection
      </p>

      <p class="TODO">
        For 3d how to plug shaders into RenderProgram?
      </p>

      <p class="TODO">
        Once we have RenderableFaces, we could split things up based on whatever shaders need to run - for instance, if
        we only can have certain shaders loaded, we can find the faces that use it? (multiple passes, for the different shaders?).
        If we are combining later anyway, why not skip the first (traced) consolidation?
        (Also for rasterization, we probably should NOT combine from separate tiles).
      </p>

      <h3 id="vectorCanvas">Vector Canvas</h3>

      <p class="TODO">
        Partially implemented in <code>VectorCanvas</code>. Basically it is possible to create a vector-accurate representation
        with our boolean operations. Any drawing operation will affect a (split) region, and it will update the RenderProgram in
        that region. We will simplify all of those, then can consolidate areas with the same RenderProgram back together.
        For this to fully work nicely, we would want to store the rational half edges permanently, and work in the rational
        space.
      </p>

      <p class="TODO">
        Radial and linear stuff in the vectorTest() uncommented can cause failures
      </p>

      <h3 id="testing">Testing</h3>

      <p class="TODO">
        More unit tests! Simplification. Clippers (fuzz, ensure we preserve area and are region-constricted). Compare to
        SVG/Canvas. Fuzz the rasterizer. Fuzz RenderPrograms.
      </p>

      <p class="TODO">
        Add assertions to make sure our faces have area that sums to the total!
      </p>

      <h2 id="stages">Stages</h2>

      <p class="TODO">
        Figure out importing Canvas/SVG APIs (we should be able to do all of SVG natively, no?)
      </p>

      <p class="TODO">
        IMPORTANT! Separate buffer for uniform configs. Storage configs copies to it. Can probably have these in a
        separate bind group. Also could just leave it as a storage buffer.
      </p>

      <p class="TODO">
        See if we can inverleave CPU/GPU work better. Can we record scene, kick it off on the GPU, then record
        render programs and then kick THAT off on the GPU? Might reduce latency that way.
      </p>

      <p class="TODO">
        Create a visual debugger for this setup - visualize the RenderChunks, etc.
      </p>

      <p class="TODO">
        Tile things in raster clipper. Load data together, then operate together.
      </p>

      <p class="TODO">
        Batch our atomic updates (especially in rasterization).
      </p>

      <h3 id="stage-transforms">Transforms</h3>

      <p>
        Use a stack monoid setup to compute the nested transforms for every path. This is not particularly required from
        Scenery (we CAN compute them CPU-side, but it uses up a lot of CPU time). However, we can ship over paths in an
        instanced fashion (e.g. font glyphs), and this stage would convert from the DAG to effectively a flat array of
        paths.
      </p>

      <p>
        NOTE: This will include perspective projection for 3D?
      </p>

      <h3 id="stage-subdivision">Subdivision</h3>

      <p>
        We will want to convert our paths to polygons (a piecewise linear form). This could come before or after the
        transform step. Might need more code to do it after the transforms (logic needed to transform elliptical arcs,
        etc., but it would be more efficient.
      </p>

      <h3 id="stage-bounds">Bounds</h3>

      <p>
        We will want the bounds of each path for later stages.
      </p>

      <h3 id="stage-tiling">Tiling</h3>

      <p>
        We will subdivide the rendered area into tiles, and for each path we will either skip it, fully include it, or clip it
        to the size of the tile.
      </p>

      <p>
        Certain potential final rasterization steps benefit from a guaranteed maximum tile size of path (e.g. if we are
        256x256, it would take 16 steps of binary subdivision to reach the pixel level).
      </p>

      <p>
        Tiling is also helpful to reduce the potential amount of cases for the later intersection step, AND with our
        integer transform, we will be able to keep more precision for the CAG steps.
      </p>

      <p class="TODO">
        Note possibility: Just use tiling constraints in intersections and face holes. OTHERWISE we should be able to use
        the heterogeneous combination of tiles for other steps - but we might want to tile for rasterization too.
      </p>

      <h3 id="stage-integerTransform">Integer Transform</h3>

      <p class="TODO">
        Can we cut precision here further to get better edge sorting? Radix sort on the rationals might be expensive.
      </p>

      <p>
        We will scale up and transform our coordinate frame (for each tile) to integer coordinates, spaced out so that we are
        keeping about 20 bits of information for each coordinate. This will allow us to use rational numbers to compute
        the intersections between line segments (and to sort edges around each vertex) in an exact robust way, keeping our
        numerator and denominator within 64 bits each. Since the outputs will be signed, we will want to put the origin
        approximately in the center of our "integer" coordinate frame.
      </p>

      <p>
        Our data will be in the <code>IntegerEdge</code> format, which essentially just stores the start point, end point,
        and the path ID.
      </p>

      <h3 id="stage-hilbertSort">Hilbert Sort</h3>

      <p>
        We will sort the the edges using a higher-dimensional
        <a href="https://en.wikipedia.org/wiki/Hilbert_curve">Hilbert space-filling curve</a>, so that edges are somewhat
        spatially coherent (edges in a similar part of the list will likely have similar positions or sizes). This helps
        reduce the workload on the next step (intersection), since it would have theoretical O(n^2) performance.
        In practice, by having typical visual data AND sorting, we get a small fraction of that!
      </p>

      <p>
        This may be unnecessary for some workloads (where the edges provided are in a spatially coherent order due to
        the scene graph), however it is absolutely necessary for others (e.g. 3d meshes).
      </p>

      <p>
        This is inspired by the bulk loading of R-Trees (see <span class="citation" data-citation-id="10.1145/1963190.2025380"></span>),
        using Hilbert Curve concepts noted in <span class="citation" data-citation-id="Bader2013"></span>.
      </p>

      <p>
        Planned to use a radix sort on GPU. Software is currently using a 6-dimensional Hilbert sort
        (centerX, centerY, minX, minY, maxX, maxY), but it is probably best to reduce that to a 4-dimensional sort.
      </p>

      <p class="TODO">
        Review paper for more insights. Potentially store Hilbert partials during the sort, OR just compute them to a given
        depth and store the result? (If we get something small however with a ton of dispersed edges... could CRATER
        performance).
      </p>

      <h3 id="stage-intersection">Intersection</h3>

      <p>
        We will need to find all of the pairwise intersections between edges. In software, this is done by computing a
        binary tree of bounding boxes (where the Hilbert sort helps significantly). We can ignore the intersection between
        two subtrees whose bounding boxes do not intersect. Furthermore, if there are no horizontal lines in either
        subtree, we can allow the bounding boxes to "touch" horizontally (sharing a single line of intersection) without
        having any substantive pairwise intersection in edges, since we only care about internal intersections (those
        which aren't endpoint-to-endpoint).
      </p>

      <p>
        We will store the intersections with their rational coordinates AND parametric value of intersection (also an
        exact rational) for each internal intersection within each of the edges.
      </p>

      <p>
        This intersection will also detect overlapping segments, and will find the endpoints that are internal to the
        other edge. Thus overlapping edges will thus be fully overlapping, and can be detected by future steps.
      </p>

      <p>
        Potentially plan to use bump allocation on the GPU, with a linked list for each edge (with atomic operations).
      </p>

      <p>
        NOTE: A great synchronous algorithm for this type of thing is <span class="citation" data-citation-id="10.1145/129902.129906"></span>,
        but it is very serial.
      </p>

      <p class="TODO">
        Do hilbert sort (or skip if data allows), segment into workgroup size chunks. Store bounds tree (or compute) for
        every chunk. At least bounds for total. THEN compute upper triangular grid of intersections by intersecting chunks.
        Filter by bounds down to... groups of 16? So most workgroups will be no op because of bounds.
      </p>

      <p class="TODO">
        Parallel plane-sweep? <a href="https://docs.lib.purdue.edu/cgi/viewcontent.cgi?article=1481&context=cstech">https://docs.lib.purdue.edu/cgi/viewcontent.cgi?article=1481&context=cstech</a>.
        Or more of a general parallel method (CREW PRAM): <a href="https://dl.acm.org/doi/pdf/10.1145/72935.72950">https://dl.acm.org/doi/pdf/10.1145/72935.72950</a>.
      </p>

      <p class="TODO">
        For parallel, could augment merge-sort approaches.
      </p>

      <p class="TODO">
        <a href="https://gpuopen.com/download/publications/HPLOC.pdf">https://gpuopen.com/download/publications/HPLOC.pdf</a>
      </p>

      <p class="TODO">
        We are somewhat solving a "bounding box" intersection problem, but can we filter in a better way that solves the
        "line segment intersection" problem? When we split a line segment, remember it gets split into two smaller
        bounding boxes (not just splitting the large bounding box).
      </p>

      <p class="TODO">
        How inexpensively can we test 45-degree-oriented bounding boxes? This might be worth it
      </p>

      <p class="TODO">
        Flesh out that "alternative" plan in <code>BoundsIntersectionFilter</code>.
      </p>

      <p class="TODO">
        Can we store information about contiguous lists of segments with no intersections? Like, from guaranteed inputs,
        or from the same curved segment (when it is split into line segments) where we can guarantee no self-intersection?
      </p>

      <p class="TODO">
        Can we use homogeneous coordinates in intersections/etc. to avoid lots of division? equality or sort is hard? Read
        2005 Skala.
      </p>

      <h3 id="stage-split">Split</h3>

      <p>
        Each integer edge will have a list of internal intersections. We will sort those intersections by their parametric
        value, and then we will output a list of "rational" edges (using `RationalHalfEdge`) that make up each
        non-intersected interval of the original edge.
      </p>

      <p>
        We will store the winding information for each path in this information. So if our edge in path X goes from A to B,
        we will create two half-edges: A -> B (winding +1) and B -> A (winding -1), both marked for path X.
      </p>

      <p class="TODO">
        Look into merge sort for sorting the linked-list accumulated splits.
      </p>

      <p class="TODO">
        If we don't sort them, is it quadratic time? However, hopefully we won't have too many intersections, and that
        could potentially be acceptable? (Sort only if it is more than N splits in an edge?)
      </p>

      <h3 id="stage-edgeSort">Edge Sort</h3>

      <p>
        We will sort our rational half-edges for the next stage (first by their starting point, then by their angle, then
        by their ID).
      </p>

      <p class="TODO">
        Perhaps use MergeSort instead of RadixSort on GPU? Radix sort might need a lot of bits, and it seems like we would be
        doing 8 bits per pass?
      </p>

      <h3 id="stage-filterConnect">Filter &amp; Connect</h3>

      <p>
        There are two goals here:
      </p>

      <ul>
        <li>
          Combine otherwise identical overlapping half-edges together. Before this, each directional edge has one path
          that it is from, but now each half-edge will be from potentially multiple paths. We will combine the winding
          contributions into a winding map (path => winding) for these cases. (NOTE: we are currently duplicating effort
          for the reverse edges here). We will want to filter out duplicates (stream compaction on GPU).
        </li>
        <li>
          Connecting our half-edge structure together. Each half-edge already had a link to its "opposite" edge. However
          since we now have edges that are sorted by start THEN angle, we have effectively "ordered" all of the outgoing
          edges around each vertex, and thus we can store our "next" and "previous" half-edges if tracing
          counter-clockwise around the vertex. This will allow tracing of the edges in future steps.
        </li>
      </ul>

      <p>
        Notes: Monoid to scan edges, choose lowest index on each edge, propagate to the other edges. Presumably
        we should use half-edges for these data structures? (Should we be computing signed area in this pass?)
      </p>

      <h3 id="stage-boundaryTrace">Boundary Trace</h3>

      <p>
        We will follow each half-edge until it loops back on itself. This will give us a boundary. It might be an
        "inner" or "outer" boundary. We can determine which it is by looking at the total signed area of the boundary
        (using the shoelace method). If it is positive, it is an inner boundary (is counter-clockwise), and if it is
        negative, it is an outer boundary (is clockwise). Outer boundaries are either (a) the outside of the bounding box
        or whatever region we have, or (b) a hole.
      </p>

      <p>
        Since we have signed area here, we can discard all edges with boundaries with zero area.
      </p>

      <p>
        On the GPU, we will be able to use reduction techniques to make this efficient (like the linked list traversal).
      </p>

      <h3 id="stage-faceHoles">Face Holes</h3>

      <p>
        Each inner boundary determines a face. We need to compute which outer boundaries are holes, and for which faces.
        Once we figure out which boundaries are holes for which faces, we can forget about whether they are inner/outer
        and just combine all of the relevant boundaries together to represent the face.
      </p>

      <p>
        We will first sort faces by their signed area. Then for each outer boundary, we will take the point on it with the
        smallest x value (compatible with our edge sort above), and we will scan for the smallest face that contains the
        point. We can filter a lot, ignoring faces smaller than the hole, faces whose bounding boxes don't fully
        (strictly) contain our hole, and we can take the smallest face that contains the point. Point-in-polygon testing
        is done with Dan Sunday's algorithm.
      </p>

      <p class="TODO">
        Look up what paper most of the filtering techniques are from!
      </p>

      <p>
        As part of this, we will determine the "unbounded" outer face (the one that effectively contains the content OUTSIDE
        of the region we are rendering).
      </p>

      <h3 id="stage-windingMaps">Winding Maps</h3>

      <p>
        We need to compute the winding map for each face. The unbounded face contains zero winding, so we can start with
        it and find adjacent faces (through edges). Each time we find an adjacent edge with one face that has a winding
        map and one that doesn't, we can compute the winding map for the face that doesn't have one (based on the
        information in the edge and in the one face).
      </p>

      <h3 id="stage-renderProgramSimplification">RenderProgram Simplification</h3>

      <p>
        Our winding map for each face, combined with the winding rule for each path, gives us a boolean map (whether
        a path is included or not) for each face. In general these are sparse, so they will only mention the included
        faces. We can simplify the RenderProgram for each face by simplifying it with the path inclusion information
        (replacing each <code>RenderPathBoolean</code> with either the inside or outside RenderProgram), combined with
        simplification.
      </p>

      <p>
        We will precompute (possibly on the CPU) a replacement acceleration structure, since we will have many primitives
        that will simplify sparse path inclusion significantly. Thus if we have 1000 paths, and 3 of them are included
        in a face, we can create a simplified version of that RenderProgram without having to do a full scan of the 1000
        items. See <code>RenderPathReplacer</code> for more details.
      </p>

      <p class="TODO">
        Write up RenderPathReplacer documentation.
      </p>

      <p>
        Now, many of the faces will have constant-valued RenderPrograms, which can be rendered much more efficiently!
        We won't have to do per-pixel blending at all in these cases.
      </p>

      <h3 id="stage-renderableFaceCreation">RenderableFace Creation</h3>

      <p>
        There now might be multiple faces that have equivalent RenderPrograms (especially those that are adjacent faces).
        Based on performance characteristics, we may want to do one of the following:
      </p>

      <dl>
        <dt>
          Keep the faces separate
        </dt>
        <dd>
          Straightforward, but we will process more edges than needed.
        </dd>
        <dt>
          Combine adjacent equivalent faces (just delete edges)
        </dt>
        <dd>
          We can scan edges to find ones with equivalent faces on both sides. We can then discard these edges, and deliver
          edges out in the "unsorted edge" format (ideally edge-clipped) for future stages.
        </dd>
        <dt>
          Combine adjacent equivalent faces (trace edges, skip equivalent)
        </dt>
        <dd>
          If we instead trace edges, and while traversing we skip over edges that have equivalent faces, we will generate
          polygonal data for each face. This will leave us with the best end-result, but might be more costly.
        </dd>
        <dt>
          Fully combine all equivalent faces
        </dt>
        <dd>
          This could be done with "unsorted edges" or traced. We will get a full path for each RenderProgram, however
          depending on the rasterization method used, having two far-away small items in the same face could hurt
          performance.
        </dd>
      </dl>

      <p class="TODO">
        NOTE: This is better implemented by generating hashes for every simplified RenderProgram, so we can do quick
        comparisons. Find everywhere we are doing equality right now (probably somewhat inefficiently?).
        ONCE this is done, it will let us DAG RenderPrograms nicely on the CPU, if we store a hashmap of simplified
        programs? (Is that useful?)
      </p>

      <p class="TODO">
        Visuals/examples would be helpful here.
      </p>

      <h3 id="stage-splitPrograms">Split Programs</h3>

      <p>
        Certain types of RenderPrograms will cause their RenderableFace to be split into multiple RenderableFaces. For
        example, the <code>RenderDepthSort</code> for 3d meshes will be able to find exact split lines between different
        3d faces fully included in the RenderableFace. Additionally, for high quality output, gradients can split their
        faces so that gradient stop transitions are properly anti-aliased.
      </p>

      <h3 id="stage-rasterize">Rasterize</h3>

      <p class="TODO">
        Document two-pass rasterization (which is really... three-pass due to tiling split?)
      </p>

      <p class="TODO">
        Two-pass: Subdivide grid so that we have 16/32-thread warps more coherent?
      </p>

      <p class="TODO">
        Two-pass: If a tile/bin is full-area AND constant, can we short-circuit and have more efficient rasterization?
      </p>

      <p class="TODO">
        Two-pass: Add image WGSL, especially the sampled types! We might want to do per-edge operations
      </p>

      <p class="TODO">
        Two-pass: coalesced edge data loading?
      </p>

      <p class="TODO">
        Two-pass: Create ConcreteType improvements for "structs", etc.
      </p>

      <p class="TODO">
        Two-pass deduplication of duplicate RenderPrograms? Hash RenderPrograms?
      </p>

      <p class="TODO">
        Two-pass: Add Mitchell-Netravali filtering (first the general case, then the grid-specific case)
      </p>

      <p class="TODO">
        Two-pass alternatives: tiling uses two segmented scans (first one determines area/num-edges, second one determines placement in final array)
      </p>

      <p class="TODO">
        Two-pass bind-group optimizations?
      </p>

      <p class="TODO">
        Is it crazy to do delta-based rendering ever? (i.e. figure out which places changed, and render the before/after pixels as a subtraction)
      </p>

      <p class="TODO">
        Can we get our workgroups somehow doing coarse and fine grid splits?
      </p>

      <p class="TODO">
        Can we include things into one buffer, and just... append to arrays of edges/chunks/etc.?
      </p>

      <p>
        Other approaches include <span class="citation" data-citation-id="10.1145/1409060.1409088"></span> and
        <span class="citation" data-citation-id="10.1145/2018323.2018337"></span>, both of which Vello seems to pull ideas
        from. See <span class="citation" data-citation-id="10.1145/2661229.2661274"></span> and
        <span class="citation" data-citation-id="10.1145/800248.807360"></span>.
      </p>

      <div class="example" id="rasterize-initial-demo-container" style="display: none;">
        <p style="text-align: left;">
          Below, Alpenglow's WebGPU rasterization output is demonstrated.
        </p>
        <div id="rasterize-initial-demo"><div class="example placeholder" style="width: 512px; height: 512px;"></div></div>
      </div>

      <p class="TODO">
        See if we should use something like Lucidchart, Mermaid is fighting us with font size and layout.
      </p>

      <p class="TODO">
        Customizable execution instruction limits, error out if past a certain level. This might be what is killing performance.
      </p>

      <p class="TODO">
        Read RAVG/Massive/Vello again, might switch to a technique more like that.
      </p>

      <p class="TODO">
        See if out-of-bounds writes are wrecking our correctness. They can scribble anywhere in the binding?
      </p>

      <p class="TODO">
        Restructure Parallel code handling so that out-of-bounds writes result in failures.
      </p>

      <p class="TODO">
        Can we combine all of the reduces into one buffer?
      </p>

      <p class="TODO">
        Can we combine our complete chunks and complete edges into one buffer? We are exporting to a bunch of other
        buffers currently? Or are we reusing it? Perhaps use atomics to store what is used/freed, and GC?
      </p>

      <p class="TODO">
        Try different workgroup sizes, see if it is better (especially when we can rake things)
      </p>

      <p>
        Track <a href="https://github.com/googlefonts/gpu-path-rendering-paper">https://github.com/googlefonts/gpu-path-rendering-paper</a>
      </p>

      <p class="TODO">
        Gamut mapping in WGSL seems to not be working.
      </p>

      <p class="TODO">
        Run a queue encoder for every tile independently?
      </p>

      <p class="TODO">
        Measure workgroup execution order on GPUs to see if we can schedule asymmetric (large) load workgroups first somehow.
      </p>

      <p class="TODO">
        Test better accumulation than our atomics strategy?
      </p>

      <p class="TODO">
        Analyze with GPT-4 for bug detection?
      </p>

      <p class="TODO">
        Create a "performance" section, to note things. Prefer Firefox for profiling CPU currently. Convert things to Rust
        and use WGPU to get better GPU profiling? (Or if we can profile Metal using the debug tools, that could help).
        Look into <a href="https://github.com/takahirox/webgpu-devtools">webgpu-devtools</a>. But for now, we can use FPS
        tests.
      </p>

      <p class="TODO">
        IMPORTANT! Our resulting edge-clipped counts should be cyclic, right? Only use 2 bits per count!!
        They should wrap around or accumulate as long as we do arithmetic mod 4.
        Right now we are taking up 4x i32 for this, and that's (a) excessive, (b) killing performance, and (c) reducing
        the size of our potential workgroups.
      </p>

      <p class="TODO">
        Modify our approach to grid-clip (2 levels?) - 256x256 tile can be done with two successive 16x16 grid clips.
      </p>

      <p class="TODO">
        There are graphical artifacts on <code>rasterTest()</code>. Assert that each stage is area-preserving
        (in parallel software first, then in hardware). Perhaps fuzz too.
      </p>

      <p class="TODO">
        Is indirect dispatch killing performance?
      </p>

      <p class="TODO">
        Better handle "constant" RenderPrograms, we can exit immediately!
      </p>

      <p class="TODO">
        See if we can change flags around our uniform config buffer?
      </p>

      <p class="TODO">
        Is memory bandwidth (and the number of dispatches with memory reads/writes) killing performance?
      </p>

      <p class="TODO">
        Performance seems to be very affected by the buffer sizes. Modify things so that we are working within the same
        buffer? I recall seeing a source saying they handle their own memory within a large buffer, instead of multiple
        buffers? See <a href="https://developer.nvidia.com/vulkan-memory-management">info</a>. Sub-allocation
        could be really efficient? How can we make sure our reads/writes are to the same area? Examine Vello approaches.
      </p>

      <p class="TODO">
        Alternative approaches, e.g. workgroup-per-initial-chunk? RenderProgram evaluation needs accurate edges
        unfortunately, and can use them in a linear way. NOTE: Maybe we can do a more efficient approach for the constant
        faces? (For instance, if we take the signed area of each edge, and accumulate that times the color, we do not
        have to do any cross-workgroup computation, and it is even more parallel). That does NOT work for RenderImage,
        or things where we really need the concept of "centroid" or face. We can't split up the centroid with that, since
        we need the full area in the first place to compute with (so we would have to have the input area for faces?) - that
        is technically possible.
      </p>

      <p class="TODO">
        Raking and other better primitives in raster-clip!
      </p>

      <p class="TODO">
        Document the approaches here.
      </p>

      <p class="TODO">
        Check how will reduced precision affect centroid computations? Area also?
      </p>

      <p class="TODO">
        Add mermaid chart here. Add accumulate and to_texture steps. Add update_config/uniform step (before edge index
        patch). Potentially put config buffer in.
      </p>

      <p class="TODO">
        Show which parts finish in which stages, if we stick to that?
      </p>

      <p class="TODO">
        Can use different techniques for different classes of rasterization. One is “constant/variable/centroid face
        needed”. Other is quantity of edges
      </p>

      <p class="TODO">
        Note "9/4 worst case ratio for quad-clip grid-clip tiling"
      </p>

      <p class="TODO">
        Consider "CombinedRaster" like split? So we can avoid expensive mapping. Does this actually save us anything?
        It really only helps on the CPU?
      </p>

      <p class="TODO">
        Consider how we could split the raster-accumulate up based on what textures are needed (some passes run with some
        textures, other passes run with others).
      </p>

      <p class="DEFERRED">
        Use frames to split this left side!
      </p>

      <h2 id="references">References</h2>

      <div id="references-container"></div>
    </div>
  </div>
  <div class="span2"></div>
</div>

</body>
</html>