diff --git a/chafa/chafa-symbol-map.c b/chafa/chafa-symbol-map.c index 8222f2e8..1b13222b 100644 --- a/chafa/chafa-symbol-map.c +++ b/chafa/chafa-symbol-map.c @@ -279,11 +279,12 @@ glyph_to_bitmap (gint width, gint height, /* Scale to cell dimensions */ - smol_scale_simple ((SmolPixelType) pixel_format, pixels, width, height, rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + smol_scale_simple (pixels, (SmolPixelType) pixel_format, width, height, rowstride, (gpointer) scaled_pixels, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ CHAFA_SYMBOL_WIDTH_PIXELS, CHAFA_SYMBOL_HEIGHT_PIXELS, - CHAFA_SYMBOL_WIDTH_PIXELS * 4); + CHAFA_SYMBOL_WIDTH_PIXELS * 4, + SMOL_NO_FLAGS); /* Generate coverage map */ @@ -308,11 +309,12 @@ glyph_to_bitmap_wide (gint width, gint height, /* Scale to cell dimensions */ - smol_scale_simple ((SmolPixelType) pixel_format, pixels, width, height, rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + smol_scale_simple (pixels, (SmolPixelType) pixel_format, width, height, rowstride, (gpointer) scaled_pixels, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ CHAFA_SYMBOL_WIDTH_PIXELS * 2, CHAFA_SYMBOL_HEIGHT_PIXELS, - CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); + CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2, + SMOL_NO_FLAGS); /* Generate coverage map */ @@ -1872,10 +1874,11 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map, gpointer temp_pixels = g_malloc (width * CHAFA_SYMBOL_HEIGHT_PIXELS * 4); /* Convert to desired pixel format */ - smol_scale_simple (SMOL_PIXEL_ARGB8_PREMULTIPLIED, *pixels_out, + smol_scale_simple (*pixels_out, SMOL_PIXEL_ARGB8_UNASSOCIATED, /* FIXME: Premul */ width, height, rowstride, - (SmolPixelType) pixel_format, temp_pixels, - width, height, rowstride); + temp_pixels, (SmolPixelType) pixel_format, + width, height, rowstride, + SMOL_NO_FLAGS); g_free (*pixels_out); *pixels_out = temp_pixels; } diff --git a/chafa/internal/chafa-indexed-image.c b/chafa/internal/chafa-indexed-image.c index b3fd5e46..88023aa4 100644 --- a/chafa/internal/chafa-indexed-image.c +++ b/chafa/internal/chafa-indexed-image.c @@ -427,6 +427,7 @@ chafa_indexed_image_draw_pixels (ChafaIndexedImage *indexed_image, gint dest_width, gint dest_height) { DrawPixelsCtx ctx; + ChafaColor bg; g_return_if_fail (dest_width == indexed_image->width); g_return_if_fail (dest_height <= indexed_image->height); @@ -444,23 +445,54 @@ chafa_indexed_image_draw_pixels (ChafaIndexedImage *indexed_image, ctx.dest_width = dest_width; ctx.dest_height = dest_height; +#if 0 + /* FIXME: Need a new smolscale compositing mode that preserves src + * alpha before this can be implemented */ + bg = *chafa_palette_get_color (&indexed_image->palette, + CHAFA_COLOR_SPACE_RGB, + CHAFA_PALETTE_INDEX_BG); + bg.ch [3] = 0xff; +#else gen_color_lut_rgba8 (ctx.bg_color_lut, *chafa_palette_get_color (&indexed_image->palette, CHAFA_COLOR_SPACE_RGB, CHAFA_PALETTE_INDEX_BG)); +#endif ctx.scaled_data = g_new (guint32, dest_width * dest_height); - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ +#if 0 + bg.ch, +#else + NULL, +#endif + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_PREMULTIPLIED, dest_width, dest_height, dest_width * sizeof (guint32), + /* Placement */ + 0, + 0, + dest_width * SMOL_SUBPIXEL_MUL, + dest_height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC_CLEAR_DEST, 
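[Reviewer note, illustrative only, not part of the patch] The smol_scale_simple() call sites above all follow the same migration pattern: the source pointer moves to the front, each surface is described as (pixels, pixel type, width, height, rowstride), and a flags argument is appended; the smol_scale_new_full() sites additionally gain fill, placement and compositing arguments. A minimal sketch of the new smol_scale_simple() argument order, paraphrased from the hunks above — the buffer names, sizes and exact parameter types here are illustrative, not authoritative:

    #include <stdint.h>
    #include <stdlib.h>
    #include "smolscale.h"

    static void
    scale_rgba_sketch (const uint32_t *src, uint32_t src_w, uint32_t src_h)
    {
        uint32_t dest_w = 64, dest_h = 32;
        uint32_t *dest = malloc (dest_w * dest_h * sizeof (uint32_t));

        /* Source surface first, then destination surface, then flags */
        smol_scale_simple (src, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                           src_w, src_h, src_w * sizeof (uint32_t),
                           dest, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                           dest_w, dest_h, dest_w * sizeof (uint32_t),
                           SMOL_NO_FLAGS);
        free (dest);
    }
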
+#if 0 + SMOL_NO_FLAGS, + NULL, +#else + SMOL_DISABLE_SRGB_LINEARIZATION, post_scale_row, +#endif &ctx); draw_pixels (&ctx); diff --git a/chafa/internal/chafa-iterm2-canvas.c b/chafa/internal/chafa-iterm2-canvas.c index e305ad60..31cb7399 100644 --- a/chafa/internal/chafa-iterm2-canvas.c +++ b/chafa/internal/chafa-iterm2-canvas.c @@ -76,10 +76,17 @@ typedef enum } TiffTagId; +typedef enum +{ + TIFF_EXTRA_SAMPLE_UNSPECIFIED = 0, + TIFF_EXTRA_SAMPLE_ASSOC_ALPHA = 1, + TIFF_EXTRA_SAMPLE_UNASSOC_ALPHA = 2 +} +TiffExtraSampleType; + #define TIFF_PHOTOMETRIC_INTERPRETATION_RGB 2 #define TIFF_ORIENTATION_TOPLEFT 1 #define TIFF_PLANAR_CONFIGURATION_CONTIGUOUS 1 -#define TIFF_EXTRA_SAMPLE_ASSOC_ALPHA 1 typedef struct { @@ -150,16 +157,29 @@ chafa_iterm2_canvas_draw_all_pixels (ChafaIterm2Canvas *iterm2_canvas, ChafaPixe return; ctx.iterm2_canvas = iterm2_canvas; - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ + NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul? */ iterm2_canvas->width, iterm2_canvas->height, iterm2_canvas->width * sizeof (guint32), + /* Placement */ + 0, + 0, + iterm2_canvas->width * SMOL_SUBPIXEL_MUL, + iterm2_canvas->height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC, + SMOL_DISABLE_SRGB_LINEARIZATION, NULL, &ctx); @@ -246,7 +266,7 @@ chafa_iterm2_canvas_build_ansi (ChafaIterm2Canvas *iterm2_canvas, ChafaTermInfo generate_tag (&base64, out_str, TIFF_TAG_STRIP_BYTE_COUNTS, TIFF_TYPE_LONG, 1, iterm2_canvas->width * iterm2_canvas->height * 4); generate_tag (&base64, out_str, TIFF_TAG_PLANAR_CONFIGURATION, TIFF_TYPE_SHORT, 1, TIFF_PLANAR_CONFIGURATION_CONTIGUOUS); - generate_tag (&base64, out_str, TIFF_TAG_EXTRA_SAMPLES, TIFF_TYPE_SHORT, 1, TIFF_EXTRA_SAMPLE_ASSOC_ALPHA); + generate_tag (&base64, out_str, TIFF_TAG_EXTRA_SAMPLES, TIFF_TYPE_SHORT, 1, TIFF_EXTRA_SAMPLE_UNASSOC_ALPHA); /* Next IFD offset (terminator) */ diff --git a/chafa/internal/chafa-kitty-canvas.c b/chafa/internal/chafa-kitty-canvas.c index 411f7eaa..bdd70062 100644 --- a/chafa/internal/chafa-kitty-canvas.c +++ b/chafa/internal/chafa-kitty-canvas.c @@ -41,8 +41,6 @@ typedef struct { ChafaKittyCanvas *kitty_canvas; SmolScaleCtx *scale_ctx; - ChafaColor bg_color; - gboolean flatten_alpha; } DrawCtx; @@ -130,14 +128,6 @@ draw_pixels_worker (ChafaBatchInfo *batch, const DrawCtx *ctx) ((guint32 *) ctx->kitty_canvas->rgba_image) + (ctx->kitty_canvas->width * batch->first_row), batch->first_row, batch->n_rows); - - /* FIXME: Smolscale should be able to do this */ - if (ctx->flatten_alpha) - chafa_composite_rgba_on_solid_color (ctx->bg_color, - ctx->kitty_canvas->rgba_image, - ctx->kitty_canvas->width, - batch->first_row, - batch->n_rows); } void @@ -146,7 +136,9 @@ chafa_kitty_canvas_draw_all_pixels (ChafaKittyCanvas *kitty_canvas, ChafaPixelTy gint src_width, gint src_height, gint src_rowstride, ChafaColor bg_color) { + uint8_t bg_color_rgba [4]; DrawCtx ctx; + gboolean flatten_alpha; g_return_if_fail (kitty_canvas != NULL); g_return_if_fail (src_pixel_type < CHAFA_PIXEL_MAX); @@ -157,22 +149,36 @@ chafa_kitty_canvas_draw_all_pixels (ChafaKittyCanvas *kitty_canvas, ChafaPixelTy if (src_width == 0 || src_height == 0) return; + flatten_alpha = bg_color.ch [3] == 0; + bg_color.ch [3] = 0xff; + chafa_color8_store_to_rgba8 
(bg_color, bg_color_rgba); + ctx.kitty_canvas = kitty_canvas; - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ + flatten_alpha ? bg_color_rgba : NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Opaque? */ kitty_canvas->width, kitty_canvas->height, kitty_canvas->width * sizeof (guint32), + /* Placement */ + 0, + 0, + kitty_canvas->width * SMOL_SUBPIXEL_MUL, + kitty_canvas->height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC, + SMOL_NO_FLAGS, NULL, &ctx); - ctx.bg_color = bg_color; - ctx.flatten_alpha = bg_color.ch [3] == 0; - chafa_process_batches (&ctx, (GFunc) draw_pixels_worker, NULL, diff --git a/chafa/internal/chafa-pixops.c b/chafa/internal/chafa-pixops.c index 6dfd7ac1..de6658bb 100644 --- a/chafa/internal/chafa-pixops.c +++ b/chafa/internal/chafa-pixops.c @@ -620,33 +620,6 @@ composite_alpha_on_bg (ChafaColor bg_color, } } -/* FIXME: Could we always destroy the alpha channel and eliminate the other - * variant? */ -static void -composite_alpha_on_solid (ChafaColor bg_color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows) -{ - ChafaPixel *p0, *p1; - - p0 = pixels + first_row * width; - p1 = p0 + n_rows * width; - - for ( ; p0 < p1; p0++) - { - p0->col.ch [0] += (bg_color.ch [0] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [1] += (bg_color.ch [1] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [2] += (bg_color.ch [2] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [3] = 0xff; - } -} - -void -chafa_composite_rgba_on_solid_color (ChafaColor color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows) -{ - composite_alpha_on_solid (color, pixels, width, first_row, n_rows); -} - static void prepare_pixels_2_worker (ChafaBatchInfo *batch, PrepareContext *prep_ctx) { @@ -798,16 +771,17 @@ chafa_prepare_pixel_data_for_symbols (const ChafaPalette *palette, prep_ctx.dest_width = dest_width; prep_ctx.dest_height = dest_height; - prep_ctx.scale_ctx = smol_scale_new ((SmolPixelType) prep_ctx.src_pixel_type, - (const guint32 *) prep_ctx.src_pixels, - prep_ctx.src_width, - prep_ctx.src_height, - prep_ctx.src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, - NULL, - prep_ctx.dest_width, - prep_ctx.dest_height, - prep_ctx.dest_width * sizeof (guint32)); + prep_ctx.scale_ctx = smol_scale_new_simple (prep_ctx.src_pixels, + (SmolPixelType) prep_ctx.src_pixel_type, + prep_ctx.src_width, + prep_ctx.src_height, + prep_ctx.src_rowstride, + NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ + prep_ctx.dest_width, + prep_ctx.dest_height, + prep_ctx.dest_width * sizeof (guint32), + SMOL_NO_FLAGS); prepare_pixels_pass_1 (&prep_ctx); prepare_pixels_pass_2 (&prep_ctx); diff --git a/chafa/internal/chafa-pixops.h b/chafa/internal/chafa-pixops.h index 5adab95e..5980dada 100644 --- a/chafa/internal/chafa-pixops.h +++ b/chafa/internal/chafa-pixops.h @@ -43,9 +43,6 @@ void chafa_sort_pixel_index_by_channel (guint8 *index, const ChafaPixel *pixels, gint n_pixels, gint ch); -void chafa_composite_rgba_on_solid_color (ChafaColor color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows); - G_END_DECLS #endif /* __CHAFA_PIXOPS_H__ */ diff --git a/chafa/internal/smolscale/Makefile.am b/chafa/internal/smolscale/Makefile.am index ae60118b..47c59f7a 100644 --- 
a/chafa/internal/smolscale/Makefile.am +++ b/chafa/internal/smolscale/Makefile.am @@ -14,6 +14,7 @@ libsmolscale_la_LIBADD = libsmolscale_la_SOURCES = \ smolscale.c \ smolscale.h \ + smolscale-generic.c \ smolscale-private.h if HAVE_AVX2_INTRINSICS diff --git a/chafa/internal/smolscale/smolscale-avx2.c b/chafa/internal/smolscale/smolscale-avx2.c index adc96966..7f179ea5 100644 --- a/chafa/internal/smolscale/smolscale-avx2.c +++ b/chafa/internal/smolscale/smolscale-avx2.c @@ -10,1042 +10,600 @@ #include #include "smolscale-private.h" +/* ---------------------- * + * Context initialization * + * ---------------------- */ -/* --- Linear interpolation helpers --- */ +/* Number of horizontal pixels to process in a single batch. The define exists for + * clarity and cannot be changed without significant changes to the code elsewhere. */ +#define BILIN_HORIZ_BATCH_PIXELS 16 -#define LERP_SIMD256_EPI32(a, b, f) \ - _mm256_add_epi32 ( \ - _mm256_srli_epi32 ( \ - _mm256_mullo_epi32 ( \ - _mm256_sub_epi32 ((a), (b)), factors), 8), (b)) - -#define LERP_SIMD128_EPI32(a, b, f) \ - _mm_add_epi32 ( \ - _mm_srli_epi32 ( \ - _mm_mullo_epi32 ( \ - _mm_sub_epi32 ((a), (b)), factors), 8), (b)) - -#define LERP_SIMD256_EPI32_AND_MASK(a, b, f, mask) \ - _mm256_and_si256 (LERP_SIMD256_EPI32 ((a), (b), (f)), (mask)) - -#define LERP_SIMD128_EPI32_AND_MASK(a, b, f, mask) \ - _mm_and_si128 (LERP_SIMD128_EPI32 ((a), (b), (f)), (mask)) - -/* --- Premultiplication --- */ - -#define INVERTED_DIV_SHIFT 21 -#define INVERTED_DIV_ROUNDING (1U << (INVERTED_DIV_SHIFT - 1)) -#define INVERTED_DIV_ROUNDING_128BPP \ - (((uint64_t) INVERTED_DIV_ROUNDING << 32) | INVERTED_DIV_ROUNDING) - -/* This table is used to divide by an integer [1..255] using only a lookup, - * multiplication and a shift. This is faster than plain division on most - * architectures. +/* Batched precalc array layout: * - * Each entry represents the integer 2097152 (1 << 21) divided by the index - * of the entry. Consequently, + * 16 offsets followed by 16 factors, repeating until epilogue. The epilogue + * has offsets and factors alternating one by one, and will always have fewer + * than 16 o/f pairs: * - * (v / i) ~= (v * inverted_div_table [i] + (1 << 20)) >> 21 + * ooooooooooooooooffffffffffffffffooooooooooooooooffffffffffffffffofofofofof... * - * (1 << 20) is added for nearest rounding. It would've been nice to keep - * this table in uint16_t, but alas, we need the extra bits for sufficient - * precision. 
*/ -static const uint32_t inverted_div_table [256] = -{ - 0,2097152,1048576, 699051, 524288, 419430, 349525, 299593, - 262144, 233017, 209715, 190650, 174763, 161319, 149797, 139810, - 131072, 123362, 116508, 110376, 104858, 99864, 95325, 91181, - 87381, 83886, 80660, 77672, 74898, 72316, 69905, 67650, - 65536, 63550, 61681, 59919, 58254, 56680, 55188, 53773, - 52429, 51150, 49932, 48771, 47663, 46603, 45590, 44620, - 43691, 42799, 41943, 41121, 40330, 39569, 38836, 38130, - 37449, 36792, 36158, 35545, 34953, 34380, 33825, 33288, - 32768, 32264, 31775, 31301, 30840, 30394, 29959, 29537, - 29127, 28728, 28340, 27962, 27594, 27236, 26887, 26546, - 26214, 25891, 25575, 25267, 24966, 24672, 24385, 24105, - 23831, 23564, 23302, 23046, 22795, 22550, 22310, 22075, - 21845, 21620, 21400, 21183, 20972, 20764, 20560, 20361, - 20165, 19973, 19784, 19600, 19418, 19240, 19065, 18893, - 18725, 18559, 18396, 18236, 18079, 17924, 17772, 17623, - 17476, 17332, 17190, 17050, 16913, 16777, 16644, 16513, - 16384, 16257, 16132, 16009, 15888, 15768, 15650, 15534, - 15420, 15308, 15197, 15087, 14980, 14873, 14769, 14665, - 14564, 14463, 14364, 14266, 14170, 14075, 13981, 13888, - 13797, 13707, 13618, 13530, 13443, 13358, 13273, 13190, - 13107, 13026, 12945, 12866, 12788, 12710, 12633, 12558, - 12483, 12409, 12336, 12264, 12193, 12122, 12053, 11984, - 11916, 11848, 11782, 11716, 11651, 11586, 11523, 11460, - 11398, 11336, 11275, 11215, 11155, 11096, 11038, 10980, - 10923, 10866, 10810, 10755, 10700, 10645, 10592, 10538, - 10486, 10434, 10382, 10331, 10280, 10230, 10180, 10131, - 10082, 10034, 9986, 9939, 9892, 9846, 9800, 9754, - 9709, 9664, 9620, 9576, 9533, 9489, 9447, 9404, - 9362, 9321, 9279, 9239, 9198, 9158, 9118, 9079, - 9039, 9001, 8962, 8924, 8886, 8849, 8812, 8775, - 8738, 8702, 8666, 8630, 8595, 8560, 8525, 8490, - 8456, 8422, 8389, 8355, 8322, 8289, 8257, 8224, -}; - -/* Masking and shifting out the results is left to the caller. In - * and out may not overlap. 
*/ -static SMOL_INLINE void -unpremul_i_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = ((in [0] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); - out [1] = ((in [1] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); -} + * 16 offsets layout: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 16 factors layout: 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + */ -static SMOL_INLINE void -unpremul_p_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) +static uint32_t +array_offset_offset (uint32_t elem_i, int max_index, int do_batches) { - out [0] = (((in [0] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); - out [1] = (((in [1] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); + if (do_batches + && (max_index - ((elem_i / BILIN_HORIZ_BATCH_PIXELS) * BILIN_HORIZ_BATCH_PIXELS) + >= BILIN_HORIZ_BATCH_PIXELS)) + { + return (elem_i / (BILIN_HORIZ_BATCH_PIXELS)) * (BILIN_HORIZ_BATCH_PIXELS * 2) + + (elem_i % BILIN_HORIZ_BATCH_PIXELS); + } + else + { + return elem_i * 2; + } } -static SMOL_INLINE uint64_t -unpremul_p_to_u_64bpp (const uint64_t in, - uint8_t alpha) +static uint32_t +array_offset_factor (uint32_t elem_i, int max_index, int do_batches) { - uint64_t in_128bpp [2]; - uint64_t out_128bpp [2]; + const uint8_t o [BILIN_HORIZ_BATCH_PIXELS] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; - in_128bpp [0] = (in & 0x000000ff000000ff); - in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - - unpremul_p_to_u_128bpp (in_128bpp, out_128bpp, alpha); - - return (out_128bpp [0] & 0x000000ff000000ff) - | ((out_128bpp [1] & 0x000000ff000000ff) << 16); + if (do_batches + && (max_index - ((elem_i / BILIN_HORIZ_BATCH_PIXELS) * BILIN_HORIZ_BATCH_PIXELS) + >= BILIN_HORIZ_BATCH_PIXELS)) + { + return (elem_i / (BILIN_HORIZ_BATCH_PIXELS)) * (BILIN_HORIZ_BATCH_PIXELS * 2) + + BILIN_HORIZ_BATCH_PIXELS + o [elem_i % BILIN_HORIZ_BATCH_PIXELS]; + } + else + { + return elem_i * 2 + 1; + } } -static SMOL_INLINE uint64_t -premul_u_to_p_64bpp (const uint64_t in, - uint8_t alpha) +static void +precalc_linear_range (uint16_t *array_out, + int first_index, + int last_index, + int max_index, + uint64_t first_sample_ofs, + uint64_t sample_step, + int sample_ofs_px_max, + int32_t dest_clip_before_px, + int do_batches, + int *array_i_inout) { - return ((in * ((uint16_t) alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; -} - -/* --- Packing --- */ - -/* It's nice to be able to shift by a negative amount */ -#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) - -#if 0 -/* Currently unused */ - -/* This is kind of bulky (~13 x86 insns), but it's about the same as using - * unions, and we don't have to worry about endianness. 
*/ -#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) -#endif - -#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) - -#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 2 : n) - -#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - -#if 0 -/* Currently unused */ - -#define PACK_FROM_1324_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [(SWAP_2_AND_3 (a) - 1) >> 1]), \ - ((SWAP_2_AND_3 (a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (b) - 1) >> 1]), \ - ((SWAP_2_AND_3 (b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (c) - 1) >> 1]), \ - ((SWAP_2_AND_3 (c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (d) - 1) >> 1]), \ - ((SWAP_2_AND_3 (d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -#endif + uint64_t sample_ofs; + int i; -/* Pack p -> p */ + sample_ofs = first_sample_ofs; -static SMOL_INLINE uint32_t -pack_pixel_1324_p_to_1234_p_64bpp (uint64_t in) -{ - return in | (in >> 24); -} + for (i = first_index; i < last_index; i++) + { + uint16_t sample_ofs_px = sample_ofs / SMOL_BILIN_MULTIPLIER; -static void -pack_row_1324_p_to_1234_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; + if (sample_ofs_px >= sample_ofs_px_max - 1) + { + if (i >= dest_clip_before_px) + { + array_out [array_offset_offset ((*array_i_inout), max_index, do_batches)] = sample_ofs_px_max - 2; + array_out [array_offset_factor ((*array_i_inout), max_index, do_batches)] = 0; + (*array_i_inout)++; + } + continue; + } - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + if (i >= dest_clip_before_px) + { + array_out [array_offset_offset ((*array_i_inout), max_index, do_batches)] = sample_ofs_px; + array_out [array_offset_factor ((*array_i_inout), max_index, do_batches)] = SMOL_SMALL_MUL + - ((sample_ofs / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); + (*array_i_inout)++; + } - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); + sample_ofs += sample_step; } } static void -pack_row_132a_p_to_123_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +precalc_bilinear_array (uint16_t *array, + uint64_t src_dim_spx, + uint64_t dest_ofs_spx, + uint64_t dest_dim_spx, + uint32_t dest_dim_prehalving_px, + unsigned int n_halvings, + int32_t dest_clip_before_px, + int32_t dest_clip_after_px, + unsigned int do_batches) { - uint8_t *row_out_max = row_out + n_pixels * 3; + uint32_t src_dim_px = SMOL_SPX_TO_PX (src_dim_spx); + uint64_t first_sample_ofs [3]; + uint64_t sample_step; + int i = 0; + + 
assert (src_dim_px > 1); - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; - while (row_out != row_out_max) + if (src_dim_spx > dest_dim_spx) { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; + /* Minification */ + sample_step = ((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / dest_dim_spx; + first_sample_ofs [0] = (sample_step - SMOL_BILIN_MULTIPLIER) / 2; + first_sample_ofs [1] = ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + + ((sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx) * (1 << n_halvings)) / SMOL_SUBPIXEL_MUL); } + else + { + /* Magnification */ + sample_step = ((src_dim_spx - SMOL_SUBPIXEL_MUL) * SMOL_BILIN_MULTIPLIER) + / (dest_dim_spx > SMOL_SUBPIXEL_MUL ? (dest_dim_spx - SMOL_SUBPIXEL_MUL) : 1); + first_sample_ofs [0] = 0; + first_sample_ofs [1] = (sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL; + } + + first_sample_ofs [2] = (((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / SMOL_SUBPIXEL_MUL) + + ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + - sample_step * (1U << n_halvings); + + /* Left fringe */ + precalc_linear_range (array, + 0, + 1 << n_halvings, + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [0], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); + + /* Main range */ + precalc_linear_range (array, + 1 << n_halvings, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [1], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); + + /* Right fringe */ + precalc_linear_range (array, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px, + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [2], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); } static void -pack_row_132a_p_to_321_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +precalc_boxes_array (uint32_t *array, + uint32_t *span_step, + uint32_t *span_mul, + uint32_t src_dim_spx, + int32_t dest_dim, + uint32_t dest_ofs_spx, + uint32_t dest_dim_spx, + int32_t dest_clip_before_px) +{ + uint64_t fracF, frac_stepF; + uint64_t f; + uint64_t stride; + uint64_t a, b; + int i, dest_i; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; - while (row_out != row_out_max) - { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} + /* Output sample can't be less than a pixel. Fringe opacity is applied in + * a separate step. FIXME: May cause wrong subpixel distribution -- revisit. 
*/ + if (dest_dim_spx < 256) + dest_dim_spx = 256; -#define DEF_PACK_FROM_1324_P_TO_P_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (uint64_t in) \ -{ \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1324_p_to_##a##b##c##d##_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (*(row_in++)); \ -} + frac_stepF = ((uint64_t) src_dim_spx * SMOL_BIG_MUL) / (uint64_t) dest_dim_spx; + fracF = 0; -DEF_PACK_FROM_1324_P_TO_P_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_1324_P_TO_P_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_1324_P_TO_P_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 3, 2, 1) + stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; + f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; -static SMOL_INLINE uint32_t -pack_pixel_1234_p_to_1234_p_128bpp (const uint64_t *in) -{ - /* FIXME: Are masks needed? */ - return ((in [0] >> 8) & 0xff000000) - | ((in [0] << 16) & 0x00ff0000) - | ((in [1] >> 24) & 0x0000ff00) - | (in [1] & 0x000000ff); -} + /* We divide by (b + 1) instead of just (b) to avoid overflows in + * scale_128bpp_half(), which would affect horizontal box scaling. The + * fudge factor counters limited precision in the inverted division + * operation. It causes 16-bit values to undershoot by less than 127/65535 + * (<.2%). Since the final output is 8-bit, and rounding neutralizes the + * error, this doesn't matter. */ -static void -pack_row_1234_p_to_1234_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; + a = (SMOL_BOXES_MULTIPLIER * 255); + b = ((stride * 255) + ((f * 255) / 256)); + *span_step = frac_stepF / SMOL_SMALL_MUL; + *span_mul = (a + (b / 2)) / (b + 1); + + /* Left fringe */ + i = 0; + dest_i = 0; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + if (dest_i >= dest_clip_before_px) + array [i++] = 0; - while (row_out != row_out_max) + /* Main range */ + fracF = ((frac_stepF * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL); + for (dest_i = 1; dest_i < dest_dim - 1; dest_i++) { - *(row_out++) = pack_pixel_1234_p_to_1234_p_128bpp (row_in); - row_in += 2; + if (dest_i >= dest_clip_before_px) + array [i++] = fracF / SMOL_SMALL_MUL; + fracF += frac_stepF; } -} -#define DEF_PACK_FROM_1234_P_TO_P_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - return PACK_FROM_1234_128BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (row_in); \ - row_in += 2; \ - } \ + /* Right fringe */ + if (dest_dim > 1 && dest_i >= dest_clip_before_px) + array [i++] = (((uint64_t) src_dim_spx * SMOL_SMALL_MUL - frac_stepF) / SMOL_SMALL_MUL); } -DEF_PACK_FROM_1234_P_TO_P_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_1234_P_TO_P_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_1234_P_TO_P_128BPP (3, 2, 1, 4) 
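[Reviewer note, illustrative only, not part of the patch] The batched precalc layout described earlier in this hunk is easier to see with a concrete mapping. For one full batch of 16 output pixels, offsets occupy slots 0..15 in pixel order and factors occupy slots 16..31 in the interleaved order 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15, as the layout comment states. A small standalone program (names illustrative only) that reproduces the slot mapping computed by array_offset_offset() and array_offset_factor() for a full batch:

    #include <stdio.h>

    #define BATCH 16

    int
    main (void)
    {
        /* Same permutation as the o[] table in array_offset_factor() above */
        const int o [BATCH] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
        int elem_i;

        for (elem_i = 0; elem_i < BATCH; elem_i++)
        {
            int offset_slot = elem_i;              /* offsets fill slots 0..15 in order */
            int factor_slot = BATCH + o [elem_i];  /* factors fill slots 16..31, interleaved */

            printf ("pixel %2d -> offset slot %2d, factor slot %2d\n",
                    elem_i, offset_slot, factor_slot);
        }

        return 0;
    }

Reading the factor slots back in order 16..31 yields the pixel sequence 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15; the epilogue (fewer than 16 remaining pixels) falls back to the plain alternating elem_i * 2 / elem_i * 2 + 1 scheme.
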
-DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 3, 2, 1) - static void -pack_row_123a_p_to_123_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_dim (SmolDim *dim, int do_batches) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (dim->filter_type == SMOL_FILTER_ONE || dim->filter_type == SMOL_FILTER_COPY) { - *(row_out++) = *row_in >> 32; - *(row_out++) = *(row_in++); - *(row_out++) = *(row_in++) >> 32; } -} - -static void -pack_row_123a_p_to_321_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (dim->filter_type == SMOL_FILTER_BOX) { - *(row_out++) = row_in [1] >> 32; - *(row_out++) = row_in [0]; - *(row_out++) = row_in [0] >> 32; - row_in += 2; + precalc_boxes_array (dim->precalc, + &dim->span_step, + &dim->span_mul, + dim->src_size_spx, + dim->placement_size_px, + dim->placement_ofs_spx, + dim->placement_size_spx, + dim->clip_before_px); } -} - -/* Pack p (alpha last) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_132a_p_to_1234_u_64bpp (uint64_t in) -{ - uint8_t alpha = in; - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; - return in | (in >> 24); -} - -static void -pack_row_132a_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else /* SMOL_FILTER_BILINEAR_?H */ { - *(row_out++) = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); + precalc_bilinear_array (dim->precalc, + dim->src_size_spx, + dim->placement_ofs_spx, + dim->placement_size_prehalving_spx, + dim->placement_size_prehalving_px, + dim->n_halvings, + dim->clip_before_px, + dim->clip_after_px, + do_batches); } } static void -pack_row_132a_p_to_123_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_horizontal (SmolScaleCtx *scale_ctx) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + init_dim (&scale_ctx->hdim, + scale_ctx->storage_type == SMOL_STORAGE_64BPP ? 
TRUE : FALSE); } static void -pack_row_132a_p_to_321_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_132A_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = in; \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_132a_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_132A_P_TO_U_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_123A_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [1]; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_vertical (SmolScaleCtx *scale_ctx) { - uint8_t *row_out_max = row_out + n_pixels * 3; + init_dim (&scale_ctx->vdim, FALSE); +} - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* ----------------- * + * Premultiplication * + * ----------------- */ - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } +static SMOL_INLINE void +premul_u_to_p8_128bpp (uint64_t * SMOL_RESTRICT inout, + uint8_t alpha) +{ + inout [0] = (((inout [0] + 0x0000000100000001) * ((uint16_t) alpha + 1) - 0x0000000100000001) + >> 8) & 0x000000ff000000ff; + inout [1] = (((inout [1] + 0x0000000100000001) * ((uint16_t) alpha + 1) - 0x0000000100000001) + >> 8) & 0x000000ff000000ff; } -static void -pack_row_123a_p_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +unpremul_p8_to_u_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t 
*); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } + out [0] = ((in [0] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; + out [1] = ((in [1] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; } -/* Pack p (alpha first) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_a324_p_to_1234_u_64bpp (uint64_t in) +static SMOL_INLINE uint64_t +premul_u_to_p8_64bpp (const uint64_t in, + uint8_t alpha) { - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); - return in | (in >> 24); + return (((in + 0x0001000100010001) * ((uint16_t) alpha + 1) - 0x0001000100010001) + >> 8) & 0x00ff00ff00ff00ff; } -static void -pack_row_a324_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpremul_p8_to_u_64bpp (const uint64_t in, + uint8_t alpha) { - uint32_t *row_out_max = row_out + n_pixels; + uint64_t in_128bpp [2]; + uint64_t out_128bpp [2]; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + in_128bpp [0] = (in & 0x000000ff000000ff); + in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - } + unpremul_p8_to_u_128bpp (in_128bpp, out_128bpp, alpha); + + return out_128bpp [0] | (out_128bpp [1] << 16); } -static void -pack_row_a324_p_to_234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +premul_u_to_p16_128bpp (uint64_t *inout, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } + inout [0] = inout [0] * alpha; + inout [1] = inout [1] * alpha; } -static void -pack_row_a324_p_to_432_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} - -#define DEF_PACK_FROM_A324_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a324_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_A324_P_TO_U_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_A324_P_TO_U_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_A324_P_TO_U_64BPP (4, 3, 2, 1) - -#define 
DEF_PACK_FROM_A234_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [0] >> 32; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [0] = (t [0] & 0x00000000ffffffff) | ((uint64_t) alpha << 32); \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_A234_P_TO_U_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_A234_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_a234_p_to_234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +unpremul_p16_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } + out [0] = ((in [0] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; + out [1] = ((in [1] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; } -static void -pack_row_a234_p_to_432_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +/* --------- * + * Repacking * + * --------- */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* PACK_SHUF_MM256_EPI8_32_TO_128() + * + * Generates a shuffling register for packing 8bpc pixel channels in the + * provided order. The order (1, 2, 3, 4) is neutral and corresponds to + * + * _mm256_set_epi8 (13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, + * 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + */ +#define SHUF_ORDER_32_TO_128 0x01000302U +#define SHUF_CH_32_TO_128(n) ((char) (SHUF_ORDER_32_TO_128 >> ((4 - (n)) * 8))) +#define SHUF_QUAD_CH_32_TO_128(q, n) (4 * (q) + SHUF_CH_32_TO_128 (n)) +#define SHUF_QUAD_32_TO_128(q, a, b, c, d) \ + SHUF_QUAD_CH_32_TO_128 ((q), (a)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (b)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (c)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (d)) +#define PACK_SHUF_EPI8_LANE_32_TO_128(a, b, c, d) \ + SHUF_QUAD_32_TO_128 (3, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (2, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (1, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (0, (a), (b), (c), (d)) +#define PACK_SHUF_MM256_EPI8_32_TO_128(a, b, c, d) _mm256_set_epi8 ( \ + PACK_SHUF_EPI8_LANE_32_TO_128 ((a), (b), (c), (d)), \ + PACK_SHUF_EPI8_LANE_32_TO_128 ((a), (b), (c), (d))) + +/* PACK_SHUF_MM256_EPI8_32_TO_64() + * + * 64bpp version. Packs only once, so fewer contortions required. 
*/ +#define SHUF_CH_32_TO_64(n) ((char) (4 - (n))) +#define SHUF_QUAD_CH_32_TO_64(q, n) (4 * (q) + SHUF_CH_32_TO_64 (n)) +#define SHUF_QUAD_32_TO_64(q, a, b, c, d) \ + SHUF_QUAD_CH_32_TO_64 ((q), (a)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (b)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (c)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (d)) +#define PACK_SHUF_EPI8_LANE_32_TO_64(a, b, c, d) \ + SHUF_QUAD_32_TO_64 (3, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (2, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (1, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (0, (a), (b), (c), (d)) +#define PACK_SHUF_MM256_EPI8_32_TO_64(a, b, c, d) _mm256_set_epi8 ( \ + PACK_SHUF_EPI8_LANE_32_TO_64 ((a), (b), (c), (d)), \ + PACK_SHUF_EPI8_LANE_32_TO_64 ((a), (b), (c), (d))) - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} +/* It's nice to be able to shift by a negative amount */ +#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) -/* Pack i (alpha last) to u */ +/* This is kind of bulky (~13 x86 insns), but it's about the same as using + * unions, and we don't have to worry about endianness. */ +#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ + ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ + | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -static SMOL_INLINE uint32_t -pack_pixel_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT in) -{ - uint8_t alpha = (in [1] >> 8) & 0xff; - uint64_t t [2]; +#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 
2 : n) - unpremul_i_to_u_128bpp (in, t, alpha); +#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - return ((t [0] >> 8) & 0xff000000) - | ((t [0] << 16) & 0x00ff0000) - | ((t [1] >> 24) & 0x0000ff00) - | alpha; -} +/* ---------------------- * + * Repacking: 24/32 -> 64 * + * ---------------------- */ static void -pack_8x_123a_i_to_xxxx_u_128bpp (const uint64_t * SMOL_RESTRICT *in, - uint32_t * SMOL_RESTRICT *out, - uint32_t * out_max, - const __m256i channel_shuf) +unpack_8x_1234_p8_to_xxxx_p8_64bpp (const uint32_t * SMOL_RESTRICT *in, + uint64_t * SMOL_RESTRICT *out, + uint64_t *out_max, + const __m256i channel_shuf) { -#define ALPHA_MUL (1 << (INVERTED_DIV_SHIFT - 8)) -#define ALPHA_MASK SMOL_8X1BIT (0, 1, 0, 0, 0, 1, 0, 0) - - const __m256i ones = _mm256_set_epi32 ( - ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, - ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL); - const __m256i alpha_clean_mask = _mm256_set_epi32 ( - 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, - 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff); - const __m256i rounding = _mm256_set_epi32 ( - INVERTED_DIV_ROUNDING, 0, INVERTED_DIV_ROUNDING, INVERTED_DIV_ROUNDING, - INVERTED_DIV_ROUNDING, 0, INVERTED_DIV_ROUNDING, INVERTED_DIV_ROUNDING); - __m256i m00, m01, m02, m03, m04, m05, m06, m07, m08; + const __m256i zero = _mm256_setzero_si256 (); const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2; - SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (my_out, __m256i * SMOL_RESTRICT); - while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) + while ((ptrdiff_t) (my_out + 2) <= (ptrdiff_t) out_max) { - /* Load inputs */ - - m00 = _mm256_stream_load_si256 (my_in); - my_in++; - m01 = _mm256_stream_load_si256 (my_in); - my_in++; - m02 = _mm256_stream_load_si256 (my_in); - my_in++; - m03 = _mm256_stream_load_si256 (my_in); + m0 = _mm256_loadu_si256 (my_in); my_in++; - /* Load alpha factors */ - - m04 = _mm256_slli_si256 (m00, 4); - m06 = _mm256_srli_si256 (m03, 4); - m05 = _mm256_blend_epi32 (m04, m01, ALPHA_MASK); - m07 = _mm256_blend_epi32 (m06, m02, ALPHA_MASK); - m07 = _mm256_srli_si256 (m07, 4); - - m04 = _mm256_blend_epi32 (m05, m07, SMOL_8X1BIT (0, 0, 1, 1, 0, 0, 1, 1)); - m04 = _mm256_srli_epi32 (m04, 8); - m04 = _mm256_and_si256 (m04, alpha_clean_mask); - m04 = _mm256_i32gather_epi32 ((const void *) inverted_div_table, m04, 4); - - /* 2 pixels times 4 */ - - m05 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (3, 3, 3, 3)); - m06 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (2, 2, 2, 2)); - m07 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (1, 1, 1, 1)); - m08 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (0, 0, 0, 0)); - - m05 = _mm256_blend_epi32 (m05, ones, ALPHA_MASK); - m06 = _mm256_blend_epi32 (m06, ones, ALPHA_MASK); - m07 = _mm256_blend_epi32 (m07, ones, ALPHA_MASK); - m08 = _mm256_blend_epi32 (m08, ones, ALPHA_MASK); - - m05 = _mm256_mullo_epi32 (m05, m00); - m06 = _mm256_mullo_epi32 (m06, m01); - m07 = _mm256_mullo_epi32 (m07, m02); - m08 = _mm256_mullo_epi32 (m08, m03); - - m05 = _mm256_add_epi32 (m05, rounding); - m06 = _mm256_add_epi32 (m06, rounding); - m07 = _mm256_add_epi32 (m07, rounding); - m08 = 
_mm256_add_epi32 (m08, rounding); - - m05 = _mm256_srli_epi32 (m05, INVERTED_DIV_SHIFT); - m06 = _mm256_srli_epi32 (m06, INVERTED_DIV_SHIFT); - m07 = _mm256_srli_epi32 (m07, INVERTED_DIV_SHIFT); - m08 = _mm256_srli_epi32 (m08, INVERTED_DIV_SHIFT); - - /* Pack and store */ - - m00 = _mm256_packus_epi32 (m05, m06); - m01 = _mm256_packus_epi32 (m07, m08); - m00 = _mm256_packus_epi16 (m00, m01); + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); - m00 = _mm256_shuffle_epi8 (m00, channel_shuf); - m00 = _mm256_permute4x64_epi64 (m00, SMOL_4X2BIT (3, 1, 2, 0)); - m00 = _mm256_shuffle_epi32 (m00, SMOL_4X2BIT (3, 1, 2, 0)); + m1 = _mm256_unpacklo_epi8 (m0, zero); + m2 = _mm256_unpackhi_epi8 (m0, zero); - _mm256_storeu_si256 (my_out, m00); - my_out += 1; + _mm256_store_si256 (my_out, m1); + my_out++; + _mm256_store_si256 (my_out, m2); + my_out++; } - *out = (uint32_t * SMOL_RESTRICT) my_out; - *in = (const uint64_t * SMOL_RESTRICT) my_in; - -#undef ALPHA_MUL -#undef ALPHA_MASK + *out = (uint64_t * SMOL_RESTRICT) my_out; + *in = (const uint32_t * SMOL_RESTRICT) my_in; } -/* PACK_SHUF_MM256_EPI8() - * - * Generates a shuffling register for packing 8bpc pixel channels in the - * provided order. The order (1, 2, 3, 4) is neutral and corresponds to - * - * _mm256_set_epi8 (13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, - * 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); - */ -#define SHUF_ORDER 0x01000302U -#define SHUF_CH(n) ((char) (SHUF_ORDER >> ((4 - (n)) * 8))) -#define SHUF_QUAD_CH(q, n) (4 * (q) + SHUF_CH (n)) -#define SHUF_QUAD(q, a, b, c, d) \ - SHUF_QUAD_CH ((q), (a)), \ - SHUF_QUAD_CH ((q), (b)), \ - SHUF_QUAD_CH ((q), (c)), \ - SHUF_QUAD_CH ((q), (d)) -#define PACK_SHUF_EPI8_LANE(a, b, c, d) \ - SHUF_QUAD (3, (a), (b), (c), (d)), \ - SHUF_QUAD (2, (a), (b), (c), (d)), \ - SHUF_QUAD (1, (a), (b), (c), (d)), \ - SHUF_QUAD (0, (a), (b), (c), (d)) -#define PACK_SHUF_MM256_EPI8(a, b, c, d) _mm256_set_epi8 ( \ - PACK_SHUF_EPI8_LANE ((a), (b), (c), (d)), \ - PACK_SHUF_EPI8_LANE ((a), (b), (c), (d))) - -static void -pack_row_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpack_pixel_123_p8_to_132a_p8_64bpp (const uint8_t *p) { - uint32_t *row_out_max = row_out + n_pixels; - const __m256i channel_shuf = PACK_SHUF_MM256_EPI8 (1, 2, 3, 4); - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t * SMOL_RESTRICT); - - pack_8x_123a_i_to_xxxx_u_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - } + return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) + | ((uint64_t) p [2] << 32) | 0xff; } -static void -pack_row_123a_i_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; + *(dest_row++) = unpack_pixel_123_p8_to_132a_p8_64bpp (src_row); + src_row += 3; } -} - -static void -pack_row_123a_i_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * 
SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_123A_I_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint8_t alpha = (in [1] >> 8) & 0xff; \ - uint64_t t [2]; \ - unpremul_i_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - const __m256i channel_shuf = PACK_SHUF_MM256_EPI8 ((a), (b), (c), (d)); \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - pack_8x_123a_i_to_xxxx_u_128bpp (&row_in, &row_out, row_out_max, \ - channel_shuf); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_I_TO_U_128BPP(3, 2, 1, 4) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 1, 2, 3) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 3, 2, 1) - -/* Unpack p -> p */ +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_1234_p_to_1324_p_64bpp (uint32_t p) +unpack_pixel_1234_p8_to_1324_p8_64bpp (uint32_t p) { return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); } -/* AVX2 has a useful instruction for this: __m256i _mm256_cvtepu8_epi16 (__m128i a); - * It results in a different channel ordering, so it'd be important to match with - * the right kind of re-pack. 
*/ -static void -unpack_row_1234_p_to_1324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (1, 3, 2, 4); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_1234_p_to_1324_p_64bpp (*(row_in++)); + *(dest_row++) = unpack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_123_p_to_132a_p_64bpp (const uint8_t *p) +unpack_pixel_1234_p8_to_3241_p8_64bpp (uint32_t p) { - return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) - | ((uint64_t) p [2] << 32) | 0xff; + return (((uint64_t) p & 0x0000ff00) << 40) + | (((uint64_t) p & 0x00ff00ff) << 16) | (p >> 24); } -static void -unpack_row_123_p_to_132a_p_64bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (3, 2, 4, 1); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_123_p_to_132a_p_64bpp (row_in); - row_in += 3; + *(dest_row++) = unpack_pixel_1234_p8_to_3241_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_1234_p_to_1234_p_128bpp (uint32_t p, - uint64_t *out) +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_2431_p8_64bpp (uint32_t p) { uint64_t p64 = p; - out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); - out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); -} -static void -unpack_row_1234_p_to_1234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + return ((p64 & 0x00ff00ff) << 32) | ((p64 & 0x0000ff00) << 8) + | ((p64 & 0xff000000) >> 24); +} - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (2, 4, 3, 1); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - unpack_pixel_1234_p_to_1234_p_128bpp (*(row_in++), row_out); - row_out += 2; + *(dest_row++) = unpack_pixel_1234_p8_to_2431_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_123_p_to_123a_p_128bpp (const uint8_t *in, - uint64_t *out) -{ - out [0] = ((uint64_t) in [0] << 32) | in [1]; - out [1] = ((uint64_t) in [2] << 32) | 0xff; -} - -static void -unpack_row_123_p_to_123a_p_128bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpack_pixel_a234_u_to_324a_p8_64bpp (uint32_t p) { - uint64_t *row_out_max = row_out + n_pixels * 2; + uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 40) | 
(((uint64_t) p & 0x00ff00ff) << 16); + uint8_t alpha = p >> 24; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + return premul_u_to_p8_64bpp (p64, alpha) | alpha; +} - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_123_p_to_123a_p_128bpp (row_in, row_out); - row_in += 3; - row_out += 2; + *(dest_row++) = unpack_pixel_a234_u_to_324a_p8_64bpp (*(src_row++)); } -} - -/* Unpack u (alpha first) -> p */ +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_a234_u_to_a324_p_64bpp (uint32_t p) +unpack_pixel_1234_u_to_2431_p8_64bpp (uint32_t p) { - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); uint8_t alpha = p >> 24; - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); + return premul_u_to_p8_64bpp (p64, alpha) | alpha; } -static void -unpack_row_a234_u_to_a324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_a234_u_to_a324_p_64bpp (*(row_in++)); + *(dest_row++) = unpack_pixel_1234_u_to_2431_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_a234_u_to_a234_p_128bpp (uint32_t p, - uint64_t *out) +static SMOL_INLINE uint64_t +unpack_pixel_123a_u_to_132a_p8_64bpp (uint32_t p) { - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; + return premul_u_to_p8_64bpp (p64, alpha) | alpha; } -static void -unpack_row_a234_u_to_a234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_a234_u_to_a234_p_128bpp (*(row_in++), row_out); - row_out += 2; + *(dest_row++) = unpack_pixel_123a_u_to_132a_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -/* Unpack u -> i (common) */ +/* ----------------------- * + * Repacking: 24/32 -> 128 * + * ----------------------- */ static void -unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, - uint64_t * SMOL_RESTRICT *out, - uint64_t *out_max, - const __m256i channel_shuf) +unpack_8x_xxxx_u_to_123a_p16_128bpp (const uint32_t * SMOL_RESTRICT *in, + uint64_t * SMOL_RESTRICT *out, + uint64_t *out_max, + const __m256i channel_shuf) { const __m256i zero = _mm256_setzero_si256 (); const __m256i factor_shuf = _mm256_set_epi8 ( @@ -1057,10 +615,10 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, const __m256i alpha_add = _mm256_set_epi16 ( 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0); - __m256i m0, m1, m2, m3, m4, m5, m6; 
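[Reviewer note, illustrative only, not part of the patch] The rename of the old *_i_* helpers to *_p16_* in the surrounding hunks reflects, as I read it, the two premultiplied storage precisions used by the new code: premul8 rounds channel * alpha back to 8 bits, while premul16 keeps the full 16-bit product so that unpremultiplying later recovers low-alpha channels with far less rounding error. A minimal scalar illustration of that difference, using plain integer arithmetic rather than the actual _smol_inv_div_p8_lut / _smol_inv_div_p16_lut tables:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
        uint32_t channel = 200, alpha = 3;

        /* premul16-style: keep the full product; dividing by alpha recovers
         * the original channel exactly (for alpha > 0). */
        uint32_t p16 = channel * alpha;                   /* 600 */
        uint32_t back16 = p16 / alpha;                    /* 200 */

        /* premul8-style: round the product back to 8 bits first; the
         * quantization error is magnified when unpremultiplying at low alpha. */
        uint32_t p8 = (channel * alpha + 127) / 255;      /* 2 */
        uint32_t back8 = (p8 * 255 + alpha / 2) / alpha;  /* 170 */

        printf ("p16 roundtrip: %u -> %u; p8 roundtrip: %u -> %u\n",
                channel, back16, channel, back8);
        return 0;
    }
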
- __m256i fact1, fact2; const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2, m3, m4, m5, m6; + __m256i fact1, fact2; SMOL_ASSUME_ALIGNED (my_out, __m256i * SMOL_RESTRICT); @@ -1095,13 +653,13 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, m5 = _mm256_unpacklo_epi16 (m2, zero); m6 = _mm256_unpackhi_epi16 (m2, zero); - _mm256_store_si256 ((__m256i *) my_out, m3); + _mm256_store_si256 (my_out, m3); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m4); + _mm256_store_si256 (my_out, m4); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m5); + _mm256_store_si256 (my_out, m5); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m6); + _mm256_store_si256 (my_out, m6); my_out++; } @@ -1109,11 +667,84 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, *in = (const uint32_t * SMOL_RESTRICT) my_in; } -/* Unpack u (alpha first) -> i */ +static SMOL_INLINE void +unpack_pixel_123_p8_to_123a_p8_128bpp (const uint8_t *in, + uint64_t *out) +{ + out [0] = ((uint64_t) in [0] << 32) | in [1]; + out [1] = ((uint64_t) in [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE void -unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, - uint64_t *out) +unpack_pixel_123a_p8_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_p8_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32) | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + p64 = premul_u_to_p8_64bpp (p64, alpha) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16_128bpp (uint32_t p, + uint64_t *out) { uint64_t p64 = p; uint64_t alpha = p >> 24; @@ -1122,143 +753,515 @@ unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, out [1] = (((((p64 & 0x000000ff) << 32) * alpha))) | (alpha << 8) | 0x80; } -static void -unpack_row_a234_u_to_234a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * 
SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - const __m256i channel_shuf = _mm256_set_epi8 ( - 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1, - 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 (2, 3, 4, 1); + unpack_8x_xxxx_u_to_123a_p16_128bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - SMOL_ASSUME_ALIGNED (row_out, uint64_t * SMOL_RESTRICT); + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END - unpack_8x_xxxx_u_to_123a_i_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; + + p64 = premul_u_to_p8_64bpp (p64, alpha) | ((uint64_t) alpha); + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_a234_u_to_234a_i_128bpp (*(row_in++), row_out); - row_out += 2; + unpack_pixel_123a_u_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint64_t alpha = p & 0xff; + + out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); + out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; } -/* Unpack u (alpha last) -> p */ +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 (1, 2, 3, 4); + unpack_8x_xxxx_u_to_123a_p16_128bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); -static SMOL_INLINE uint64_t -unpack_pixel_123a_u_to_132a_p_64bpp (uint32_t p) + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +/* ---------------------- * + * Repacking: 64 -> 24/32 * + * ---------------------- */ + +static void +pack_8x_1234_p8_to_xxxx_p8_64bpp (const uint64_t * SMOL_RESTRICT *in, + uint32_t * SMOL_RESTRICT *out, + uint32_t * out_max, + const __m256i channel_shuf) { - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; + const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; + __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1; + + SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); + + while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) + { + /* Load inputs */ + + m0 = _mm256_stream_load_si256 (my_in); + my_in++; + m1 = _mm256_stream_load_si256 (my_in); + my_in++; + + /* Pack and store */ + + m0 = _mm256_packus_epi16 (m0, m1); + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + + _mm256_storeu_si256 (my_out, m0); + my_out++; + } + + *out = (uint32_t * SMOL_RESTRICT) my_out; + *in = (const uint64_t * SMOL_RESTRICT) my_in; +} - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); +static SMOL_INLINE 
uint32_t
+pack_pixel_1234_p8_to_1324_p8_64bpp (uint64_t in)
+{
+    return in | (in >> 24);
 }
 
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     132, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 24;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     132, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 24;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     231, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 24;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     231, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 24;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     324, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     324, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     423, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     423, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     1324, 32, 32, PREMUL8, COMPRESSED) {
+    const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (1, 3, 2, 4);
+    pack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max,
+                                      channel_shuf);
+    while (dest_row != dest_row_max)
+    {
+        *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     1324, 
32, 32, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_64BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 ((a), (b), (c), (d)); \ + pack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, \ + channel_shuf); \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_64BPP (*src_row, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint8_t alpha = *src_row; \ + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_64BPP (t, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (1, 4, 2, 3) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (2, 3, 1, 4) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 1, 3, 2) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 2, 3, 1) + +/* ----------------------- * + * Repacking: 128 -> 24/32 * + * ----------------------- */ + static void -unpack_row_123a_u_to_132a_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +pack_8x_123a_p16_to_xxxx_u_128bpp (const uint64_t * SMOL_RESTRICT *in, + uint32_t * SMOL_RESTRICT *out, + uint32_t * out_max, + const __m256i channel_shuf) { - uint64_t *row_out_max = row_out + n_pixels; +#define ALPHA_MUL (1 << (INVERTED_DIV_SHIFT_P16 - 8)) +#define ALPHA_MASK SMOL_8X1BIT (0, 1, 0, 0, 0, 1, 0, 0) + + const __m256i ones = _mm256_set_epi32 ( + ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, + ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL); + const __m256i alpha_clean_mask = _mm256_set_epi32 ( + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff); + const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; + __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2, m3, m4, m5, m6, m7, m8; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); - while (row_out != row_out_max) + while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) { - *(row_out++) = unpack_pixel_123a_u_to_132a_p_64bpp (*(row_in++)); - } -} + /* Load inputs */ -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; + m0 = _mm256_stream_load_si256 (my_in); + my_in++; + m1 = _mm256_stream_load_si256 (my_in); + my_in++; + m2 = _mm256_stream_load_si256 (my_in); + my_in++; + m3 = _mm256_stream_load_si256 (my_in); + my_in++; - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} + /* Load alpha factors */ -static void -unpack_row_123a_u_to_123a_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + m4 = _mm256_slli_si256 (m0, 4); + m6 = _mm256_srli_si256 
(m3, 4); + m5 = _mm256_blend_epi32 (m4, m1, ALPHA_MASK); + m7 = _mm256_blend_epi32 (m6, m2, ALPHA_MASK); + m7 = _mm256_srli_si256 (m7, 4); - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + m4 = _mm256_blend_epi32 (m5, m7, SMOL_8X1BIT (0, 0, 1, 1, 0, 0, 1, 1)); + m4 = _mm256_srli_epi32 (m4, 8); + m4 = _mm256_and_si256 (m4, alpha_clean_mask); + m4 = _mm256_i32gather_epi32 ((const void *) _smol_inv_div_p16_lut, m4, 4); - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} + /* 2 pixels times 4 */ -/* Unpack u (alpha last) -> i */ + m5 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (3, 3, 3, 3)); + m6 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (2, 2, 2, 2)); + m7 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (1, 1, 1, 1)); + m8 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (0, 0, 0, 0)); -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p & 0xff; + m5 = _mm256_blend_epi32 (m5, ones, ALPHA_MASK); + m6 = _mm256_blend_epi32 (m6, ones, ALPHA_MASK); + m7 = _mm256_blend_epi32 (m7, ones, ALPHA_MASK); + m8 = _mm256_blend_epi32 (m8, ones, ALPHA_MASK); - out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); - out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; -} + m5 = _mm256_mullo_epi32 (m5, m0); + m6 = _mm256_mullo_epi32 (m6, m1); + m7 = _mm256_mullo_epi32 (m7, m2); + m8 = _mm256_mullo_epi32 (m8, m3); -static void -unpack_row_123a_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - const __m256i channel_shuf = _mm256_set_epi8 ( - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + m5 = _mm256_srli_epi32 (m5, INVERTED_DIV_SHIFT_P16); + m6 = _mm256_srli_epi32 (m6, INVERTED_DIV_SHIFT_P16); + m7 = _mm256_srli_epi32 (m7, INVERTED_DIV_SHIFT_P16); + m8 = _mm256_srli_epi32 (m8, INVERTED_DIV_SHIFT_P16); - SMOL_ASSUME_ALIGNED (row_out, uint64_t * SMOL_RESTRICT); + /* Pack and store */ - unpack_8x_xxxx_u_to_123a_i_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); + m0 = _mm256_packus_epi32 (m5, m6); + m1 = _mm256_packus_epi32 (m7, m8); + m0 = _mm256_packus_epi16 (m0, m1); - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_i_128bpp (*(row_in++), row_out); - row_out += 2; + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + m0 = _mm256_shuffle_epi32 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + + _mm256_storeu_si256 (my_out, m0); + my_out += 1; } -} -/* --- Filter helpers --- */ + *out = (uint32_t * SMOL_RESTRICT) my_out; + *in = (const uint64_t * SMOL_RESTRICT) my_in; -static SMOL_INLINE const uint32_t * -inrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t inrow_ofs) -{ - return (const uint32_t *) (((const uint8_t *) scale_ctx->pixels_in) + scale_ctx->rowstride_in * inrow_ofs); +#undef ALPHA_MUL +#undef ALPHA_MASK } -static SMOL_INLINE uint32_t * -outrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t outrow_ofs) +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = *src_row >> 32; + *(dest_row++) = *(src_row++); + *(dest_row++) = *(src_row++) >> 32; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row 
!= dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = src_row [1] >> 32; + *(dest_row++) = src_row [0]; + *(dest_row++) = src_row [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_128BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_128BPP (src_row, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1]; \ + unpremul_p8_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 ((a), (b), (c), (d)); \ + pack_8x_123a_p16_to_xxxx_u_128bpp (&src_row, &dest_row, dest_row_max, \ + channel_shuf); \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (1, 2, 3, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (3, 2, 1, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 1, 2, 3) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 3, 2, 1) + +/* -------------- * + * Filter helpers * + * -------------- */ + +#define LERP_SIMD256_EPI32(a, b, f) \ + 
_mm256_add_epi32 ( \ + _mm256_srli_epi32 ( \ + _mm256_mullo_epi32 ( \ + _mm256_sub_epi32 ((a), (b)), (f)), 8), (b)) + +#define LERP_SIMD128_EPI32(a, b, f) \ + _mm_add_epi32 ( \ + _mm_srli_epi32 ( \ + _mm_mullo_epi32 ( \ + _mm_sub_epi32 ((a), (b)), (f)), 8), (b)) + +#define LERP_SIMD256_EPI32_AND_MASK(a, b, f, mask) \ + _mm256_and_si256 (LERP_SIMD256_EPI32 ((a), (b), (f)), (mask)) + +#define LERP_SIMD128_EPI32_AND_MASK(a, b, f, mask) \ + _mm_and_si128 (LERP_SIMD128_EPI32 ((a), (b), (f)), (mask)) + +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) { - return (uint32_t *) (((uint8_t *) scale_ctx->pixels_out) + scale_ctx->rowstride_out * outrow_ofs); + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; } static SMOL_INLINE uint64_t weight_pixel_64bpp (uint64_t p, uint16_t w) { - return ((p * w) >> 8) & 0x00ff00ff00ff00ff; + return ((p * w) >> 8) & 0x00ff00ff00ff00ffULL; } /* p and out may be the same address */ static SMOL_INLINE void -weight_pixel_128bpp (uint64_t *p, +weight_pixel_128bpp (const uint64_t *p, uint64_t *out, uint16_t w) { @@ -1331,8 +1334,7 @@ scale_128bpp_half (uint64_t accum, b = (accum & 0xffffffff00000000ULL) >> 32; b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - return (a & 0x000000000000ffffULL) - | ((b & 0x000000000000ffffULL) << 32); + return a | (b << 32); } static SMOL_INLINE void @@ -1371,120 +1373,598 @@ add_parts (const uint64_t * SMOL_RESTRICT parts_in, *(parts_acc_out++) += *(parts_in++); } -/* --- Horizontal scaling --- */ - -#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum = 0; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++); \ - F = *(ofs_x++); \ - \ - p = *row_parts_in; \ - q = *(row_parts_in + 1); \ - \ - accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - } \ - *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} \ - \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; \ - const __m128i mask128 = _mm_set_epi32 ( \ - 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ - const __m256i zero256 = _mm256_setzero_si256 (); \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - while (row_parts_out != row_parts_out_max) \ - { \ - __m256i a0 = _mm256_setzero_si256 (); \ - __m128i a1; \ - \ - for (i = 0; i < (1 << ((n_halvings) - 1)); i++) \ - { \ - __m256i m0, m1; \ - __m256i factors; \ - __m128i n0, n1, n2, n3, n4, n5; \ - \ - row_parts_in += *(ofs_x++) * 2; \ - n4 = _mm_set1_epi16 (*(ofs_x++)); \ - n0 = _mm_load_si128 ((__m128i *) row_parts_in); \ 
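/* Illustrative aside: the LERP_SIMD* macros above, the removed bilinear code
 * and the scalar interpolators later in the file all evaluate the same
 * fixed-point blend: with a factor F in 0..256, the result is
 * q + (((p - q) * F) >> 8), applied to every 16-bit lane at once. A minimal
 * standalone model of the 64bpp variant follows; the helper name is an
 * assumption for illustration only. */
#include <stdint.h>
#include <stdio.h>

/* Blend two pixels stored as four 16-bit lanes in a uint64_t. Unsigned
 * wraparound plus the final lane mask yields the correct per-lane result
 * even when a lane of p is smaller than the corresponding lane of q. */
static uint64_t
lerp_parts_sketch (uint64_t p, uint64_t q, uint64_t factor)
{
    return ((((p - q) * factor) >> 8) + q) & 0x00ff00ff00ff00ffULL;
}

int
main (void)
{
    /* Lanes holding 0x00 and 0xff blended halfway: all four lanes become 0x7f. */
    printf ("%016llx\n",
            (unsigned long long) lerp_parts_sketch (0x00ff000000ff0000ULL,
                                                    0x000000ff000000ffULL,
                                                    128));
    return 0;
}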
- n1 = _mm_load_si128 ((__m128i *) row_parts_in + 1); \ - \ - row_parts_in += *(ofs_x++) * 2; \ - n5 = _mm_set1_epi16 (*(ofs_x++)); \ - n2 = _mm_load_si128 ((__m128i *) row_parts_in); \ - n3 = _mm_load_si128 ((__m128i *) row_parts_in + 1); \ - \ - m0 = _mm256_set_m128i (n2, n0); \ - m1 = _mm256_set_m128i (n3, n1); \ - factors = _mm256_set_m128i (n5, n4); \ - factors = _mm256_blend_epi16 (factors, zero256, 0xaa); \ - \ - m0 = LERP_SIMD256_EPI32 (m0, m1, factors); \ - a0 = _mm256_add_epi32 (a0, m0); \ - } \ - \ - a1 = _mm_add_epi32 (_mm256_extracti128_si256 (a0, 0), \ - _mm256_extracti128_si256 (a0, 1)); \ - a1 = _mm_srli_epi32 (a1, (n_halvings)); \ - a1 = _mm_and_si128 (a1, mask128); \ - _mm_store_si128 ((__m128i *) row_parts_out, a1); \ - row_parts_out += 2; \ - } \ +static void +copy_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) = weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +copy_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + weight_pixel_128bpp (parts_in, parts_acc_out, w); + parts_in += 2; + parts_acc_out += 2; + } +} + +static void +add_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) += weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +add_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + uint64_t t [2]; + + weight_pixel_128bpp (parts_in, t, w); + parts_acc_out [0] += t [0]; + parts_acc_out [1] += t [1]; + parts_in += 2; + parts_acc_out += 2; + } +} + +static SMOL_INLINE void +apply_subpixel_opacity_64bpp (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ff00ff00ff00ffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp_half (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp (uint64_t *u64_inout, uint16_t opacity) +{ + apply_subpixel_opacity_128bpp_half (u64_inout, opacity); + apply_subpixel_opacity_128bpp_half (u64_inout + 1, opacity); +} + +static void +apply_subpixel_opacity_row_copy_64bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + n_pixels; + + while (u64_out != u64_out_max) + { + *u64_out = *u64_in++; + apply_subpixel_opacity_64bpp (u64_out, opacity); + 
u64_out++; + } +} + +static void +apply_subpixel_opacity_row_copy_128bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + (n_pixels * 2); + + while (u64_out != u64_out_max) + { + u64_out [0] = u64_in [0]; + u64_out [1] = u64_in [1]; + apply_subpixel_opacity_128bpp_half (u64_out, opacity); + apply_subpixel_opacity_128bpp_half (u64_out + 1, opacity); + u64_in += 2; + u64_out += 2; + } +} + +static void +apply_horiz_edge_opacity (const SmolScaleCtx *scale_ctx, + uint64_t *row_parts) +{ + if (scale_ctx->storage_type == SMOL_STORAGE_64BPP) + { + apply_subpixel_opacity_64bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_64bpp (&row_parts [scale_ctx->hdim.placement_size_px - 1], scale_ctx->hdim.last_opacity); + } + else + { + apply_subpixel_opacity_128bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_128bpp (&row_parts [(scale_ctx->hdim.placement_size_px - 1) * 2], scale_ctx->hdim.last_opacity); + } +} + +/* ------------------ * + * Horizontal scaling * + * ------------------ */ + +static SMOL_INLINE void +hadd_pixels_16x_to_8x_64bpp (__m256i i0, __m256i i1, __m256i i2, __m256i i3, + __m256i * SMOL_RESTRICT o0, __m256i * SMOL_RESTRICT o1) +{ + __m256i t0, t1, t2, t3; + int control_bits; + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + t0 = _mm256_shuffle_epi32 (i0, control_bits); + t1 = _mm256_shuffle_epi32 (i1, control_bits); + t2 = _mm256_shuffle_epi32 (i2, control_bits); + t3 = _mm256_shuffle_epi32 (i3, control_bits); + + t0 = _mm256_add_epi16 (t0, i0); + t1 = _mm256_add_epi16 (t1, i1); + t2 = _mm256_add_epi16 (t2, i2); + t3 = _mm256_add_epi16 (t3, i3); + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + t0 = _mm256_blend_epi32 (t0, t1, control_bits); + t1 = _mm256_blend_epi32 (t2, t3, control_bits); + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + t0 = _mm256_permute4x64_epi64 (t0, control_bits); + t1 = _mm256_permute4x64_epi64 (t1, control_bits); + + *o0 = t0; + *o1 = t1; +} + +static SMOL_INLINE __m256i +hadd_pixels_8x_to_4x_64bpp (__m256i i0, __m256i i1) +{ + __m256i t0, t1; + int control_bits; + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + t0 = _mm256_shuffle_epi32 (i0, control_bits); + t1 = _mm256_shuffle_epi32 (i1, control_bits); + + t0 = _mm256_add_epi16 (t0, i0); + t1 = _mm256_add_epi16 (t1, i1); + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + t0 = _mm256_blend_epi32 (t0, t1, control_bits); + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + t0 = _mm256_permute4x64_epi64 (t0, control_bits); + + return t0; +} + +static SMOL_INLINE void +interp_horizontal_bilinear_batch_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x, + __m256i * SMOL_RESTRICT o0, + __m256i * SMOL_RESTRICT o1, + __m256i * SMOL_RESTRICT o2, + __m256i * SMOL_RESTRICT o3) +{ + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); + const __m256i shuf_0 = _mm256_set_epi8 (3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0, + 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0); + const __m256i shuf_1 = _mm256_set_epi8 (7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, + 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4); + const __m256i shuf_2 = _mm256_set_epi8 (11, 10, 11, 10, 11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8, + 11, 10, 11, 10, 11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8); + const __m256i shuf_3 = _mm256_set_epi8 (15, 
14, 15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12, + 15, 14, 15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12); + __m256i m0, m1, m2, m3; + __m256i f0, f1, f2, f3; + __m256i q00, q10, q20, q30, q40, q50, q60, q70; + __m256i q01, q11, q21, q31, q41, q51, q61, q71; + __m256i p00, p01, p10, p11, p20, p21, p30, p31; + __m256i f; + int control_bits; + + /* Fetch pixel pairs to interpolate between, two pairs per ymm register. + * This looks clumsy, but it's a lot faster than using _mm256_i32gather_epi64(), + * as benchmarked on both Haswell and Tiger Lake. */ + + q00 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [0]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [1])), 1); + q10 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [2]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [3])), 1); + q20 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [4]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [5])), 1); + q30 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [6]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [7])), 1); + + q40 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [8]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [9])), 1); + q50 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [10]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [11])), 1); + q60 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [12]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [13])), 1); + q70 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [14]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [15])), 1); + + f = _mm256_load_si256 ((const __m256i *) (precalc_x + 16)); /* Factors */ + + /* 0123 -> 0x2x, 1x3x. 4567 -> x4x6, x5x7. Etc. */ + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + + q01 = _mm256_shuffle_epi32 (q00, control_bits); + q11 = _mm256_shuffle_epi32 (q10, control_bits); + q21 = _mm256_shuffle_epi32 (q20, control_bits); + q31 = _mm256_shuffle_epi32 (q30, control_bits); + q41 = _mm256_shuffle_epi32 (q40, control_bits); + q51 = _mm256_shuffle_epi32 (q50, control_bits); + q61 = _mm256_shuffle_epi32 (q60, control_bits); + q71 = _mm256_shuffle_epi32 (q70, control_bits); + + /* 0x2x, x4x6 -> 0426. 1x3x, x5x7 -> 1537. Etc. */ + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + + p00 = _mm256_blend_epi32 (q00, q11, control_bits); + p10 = _mm256_blend_epi32 (q20, q31, control_bits); + p20 = _mm256_blend_epi32 (q40, q51, control_bits); + p30 = _mm256_blend_epi32 (q60, q71, control_bits); + + p01 = _mm256_blend_epi32 (q01, q10, control_bits); + p11 = _mm256_blend_epi32 (q21, q30, control_bits); + p21 = _mm256_blend_epi32 (q41, q50, control_bits); + p31 = _mm256_blend_epi32 (q61, q70, control_bits); + + /* Interpolation. 0426 vs 1537. Etc. 
*/ + + m0 = _mm256_sub_epi16 (p00, p01); + m1 = _mm256_sub_epi16 (p10, p11); + m2 = _mm256_sub_epi16 (p20, p21); + m3 = _mm256_sub_epi16 (p30, p31); + + f0 = _mm256_shuffle_epi8 (f, shuf_0); + f1 = _mm256_shuffle_epi8 (f, shuf_1); + f2 = _mm256_shuffle_epi8 (f, shuf_2); + f3 = _mm256_shuffle_epi8 (f, shuf_3); + + m0 = _mm256_mullo_epi16 (m0, f0); + m1 = _mm256_mullo_epi16 (m1, f1); + m2 = _mm256_mullo_epi16 (m2, f2); + m3 = _mm256_mullo_epi16 (m3, f3); + + m0 = _mm256_srli_epi16 (m0, 8); + m1 = _mm256_srli_epi16 (m1, 8); + m2 = _mm256_srli_epi16 (m2, 8); + m3 = _mm256_srli_epi16 (m3, 8); + + m0 = _mm256_add_epi16 (m0, p01); + m1 = _mm256_add_epi16 (m1, p11); + m2 = _mm256_add_epi16 (m2, p21); + m3 = _mm256_add_epi16 (m3, p31); + + m0 = _mm256_and_si256 (m0, mask); + m1 = _mm256_and_si256 (m1, mask); + m2 = _mm256_and_si256 (m2, mask); + m3 = _mm256_and_si256 (m3, mask); + + /* [0426/1537] -> [0246/1357]. Etc. */ + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + + *o0 = _mm256_permute4x64_epi64 (m0, control_bits); + *o1 = _mm256_permute4x64_epi64 (m1, control_bits); + *o2 = _mm256_permute4x64_epi64 (m2, control_bits); + *o3 = _mm256_permute4x64_epi64 (m3, control_bits); +} + +static __m256i +interp_horizontal_bilinear_batch_to_4x_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x) +{ + __m256i m0, m1, m2, m3, s0, s1; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + hadd_pixels_16x_to_8x_64bpp (m0, m1, m2, m3, &s0, &s1); + return hadd_pixels_8x_to_4x_64bpp (s0, s1); +} + +static __m256i +interp_horizontal_bilinear_4x_batch_to_4x_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x) +{ + __m256i t0, t1, t2, t3; + + t0 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 32); + t2 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 64); + t3 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 96); + + hadd_pixels_16x_to_8x_64bpp (t0, t1, t2, t3, &t0, &t1); + return hadd_pixels_8x_to_4x_64bpp (t0, t1); +} + +/* Note that precalc_x must point to offsets and factors interleaved one by one, i.e. + * offset - factor - offset - factor, and not 16x as with the batch function. 
*/ +static SMOL_INLINE void +interp_horizontal_bilinear_epilogue_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out, + uint64_t * SMOL_RESTRICT row_parts_out_max, + const uint16_t * SMOL_RESTRICT precalc_x, + int n_halvings) +{ + while (row_parts_out != row_parts_out_max) + { + uint64_t accum = 0; + int i; + + for (i = 0; i < (1 << (n_halvings)); i++) + { + uint64_t p, q; + uint64_t F; + + p = *(row_parts_in + (*precalc_x)); + q = *(row_parts_in + (*precalc_x) + 1); + precalc_x++; + F = *(precalc_x++); + + accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + + *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; + } +} + +static void +interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 16 <= row_parts_out_max) + { + __m256i m0, m1, m2, m3; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + + _mm256_store_si256 ((__m256i *) row_parts_out + 0, m0); + _mm256_store_si256 ((__m256i *) row_parts_out + 1, m1); + _mm256_store_si256 ((__m256i *) row_parts_out + 2, m2); + _mm256_store_si256 ((__m256i *) row_parts_out + 3, m3); + + row_parts_out += 16; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 0); +} + +static void +interp_horizontal_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 8 <= row_parts_out_max) + { + __m256i m0, m1, m2, m3, s0, s1; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + hadd_pixels_16x_to_8x_64bpp (m0, m1, m2, m3, &s0, &s1); + + s0 = _mm256_srli_epi16 (s0, 1); + s1 = _mm256_srli_epi16 (s1, 1); + + _mm256_store_si256 ((__m256i *) row_parts_out, s0); + _mm256_store_si256 ((__m256i *) row_parts_out + 1, s1); + + row_parts_out += 8; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 1); +} + +static void +interp_horizontal_bilinear_2h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + 
__m256i t = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + t = _mm256_srli_epi16 (t, 2); + _mm256_store_si256 ((__m256i *) row_parts_out, t); + + row_parts_out += 4; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 2); +} + +static void +interp_horizontal_bilinear_3h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i s0, s1; + + s0 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + s1 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 32); + + s0 = hadd_pixels_8x_to_4x_64bpp (s0, s1); + s0 = _mm256_srli_epi16 (s0, 3); + _mm256_store_si256 ((__m256i *) row_parts_out, s0); + + row_parts_out += 4; + precalc_x += 64; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 3); +} + +static void +interp_horizontal_bilinear_4h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i t0; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t0 = _mm256_srli_epi16 (t0, 4); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 128; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 4); +} + +static void +interp_horizontal_bilinear_5h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i t0, t1; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 128); + + t0 = hadd_pixels_8x_to_4x_64bpp (t0, t1); + t0 = _mm256_srli_epi16 (t0, 5); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 256; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 5); } static void -interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, 
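/* Illustrative aside: the _0h.._6h variants differ only in how many bilinear
 * samples are averaged per destination pixel. With k halvings, 2^k
 * (offset, factor) pairs are consumed, each sample is the fixed-point lerp
 * used above, and the sum is shifted right by k. The AVX2 batches compute 16
 * samples at a time and the hadd_* helpers pair them up; this standalone
 * sketch does the same arithmetic one destination pixel at a time. The
 * interleaved precalc layout mirrors the epilogue above; the names are
 * assumptions for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
supersampled_bilinear_sketch (const uint64_t *src_parts,
                              const uint16_t *precalc,
                              int n_halvings)
{
    uint64_t accum = 0;
    int i;

    for (i = 0; i < (1 << n_halvings); i++)
    {
        uint64_t p = src_parts [precalc [0]];
        uint64_t q = src_parts [precalc [0] + 1];
        uint64_t f = precalc [1];

        /* Each sample lane stays <= 0xff, so up to 64 samples fit in the
         * 16-bit lanes without overflow. */
        accum += ((((p - q) * f) >> 8) + q) & 0x00ff00ff00ff00ffULL;
        precalc += 2;
    }

    return (accum >> n_halvings) & 0x00ff00ff00ff00ffULL;
}

int
main (void)
{
    const uint64_t src_parts [3] = { 0x00ff00ff00ff00ffULL, 0, 0x0040004000400040ULL };
    const uint16_t precalc [4] = { 0, 128, 1, 0 };  /* two samples -> one halving */

    /* (0x7f + 0x40) / 2 = 0x5f in every lane */
    printf ("%016llx\n",
            (unsigned long long) supersampled_bilinear_sketch (src_parts, precalc, 1));
    return 0;
}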
+interp_horizontal_bilinear_6h_64bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out; + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); - do + while (row_parts_out + 4 <= row_parts_out_max) { - row_parts_in += *(ofs_x++); - F = *(ofs_x++); + __m256i t0, t1, t2, t3; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 128); + t2 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 256); + t3 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 384); - p = *row_parts_in; - q = *(row_parts_in + 1); + hadd_pixels_16x_to_8x_64bpp (t0, t1, t2, t3, &t0, &t1); + t0 = hadd_pixels_8x_to_4x_64bpp (t0, t1); - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + t0 = _mm256_srli_epi16 (t0, 6); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 512; } - while (row_parts_out != row_parts_out_max); + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 6); } static void @@ -1492,8 +1972,8 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; const __m256i mask256 = _mm256_set_epi32 ( 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); @@ -1506,19 +1986,20 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, while (row_parts_out + 4 <= row_parts_out_max) { - __m256i m0, m1; __m256i factors; + __m256i m0, m1; __m128i n0, n1, n2, n3, n4, n5; + const uint64_t * SMOL_RESTRICT p0; - row_parts_in += *(ofs_x++) * 2; - n4 = _mm_set1_epi16 (*(ofs_x++)); - n0 = _mm_load_si128 ((__m128i *) row_parts_in); - n1 = _mm_load_si128 ((__m128i *) row_parts_in + 1); + p0 = row_parts_in + *(precalc_x++) * 2; + n4 = _mm_set1_epi16 (*(precalc_x++)); + n0 = _mm_load_si128 ((__m128i *) p0); + n1 = _mm_load_si128 ((__m128i *) p0 + 1); - row_parts_in += *(ofs_x++) * 2; - n5 = _mm_set1_epi16 (*(ofs_x++)); - n2 = _mm_load_si128 ((__m128i *) row_parts_in); - n3 = _mm_load_si128 ((__m128i *) row_parts_in + 1); + p0 = row_parts_in + *(precalc_x++) * 2; + n5 = _mm_set1_epi16 (*(precalc_x++)); + n2 = _mm_load_si128 ((__m128i *) p0); + n3 = _mm_load_si128 ((__m128i *) p0 + 1); m0 = _mm256_set_m128i (n2, n0); m1 = _mm256_set_m128i (n3, n1); @@ -1533,16 +2014,17 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx 
*scale_ctx, /* No need for a loop here; let compiler know we're doing it at most once */ if (row_parts_out != row_parts_out_max) { - __m128i m0, m1; __m128i factors; + __m128i m0, m1; uint32_t f; + const uint64_t * SMOL_RESTRICT p0; - row_parts_in += *(ofs_x++) * 2; - f = *(ofs_x++); + p0 = row_parts_in + *(precalc_x++) * 2; + f = *(precalc_x++); factors = _mm_set1_epi32 ((uint32_t) f); - m0 = _mm_stream_load_si128 ((__m128i *) row_parts_in); - m1 = _mm_stream_load_si128 ((__m128i *) row_parts_in + 1); + m0 = _mm_stream_load_si128 ((__m128i *) p0); + m1 = _mm_stream_load_si128 ((__m128i *) p0 + 1); m0 = LERP_SIMD128_EPI32_AND_MASK (m0, m1, factors, mask128); _mm_store_si128 ((__m128i *) row_parts_out, m0); @@ -1550,142 +2032,170 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, } } -DEF_INTERP_HORIZONTAL_BILINEAR(1) -DEF_INTERP_HORIZONTAL_BILINEAR(2) -DEF_INTERP_HORIZONTAL_BILINEAR(3) -DEF_INTERP_HORIZONTAL_BILINEAR(4) -DEF_INTERP_HORIZONTAL_BILINEAR(5) -DEF_INTERP_HORIZONTAL_BILINEAR(6) +#define DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(n_halvings) \ +static void \ +interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT row_parts_in, \ + uint64_t * SMOL_RESTRICT row_parts_out) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; \ + const __m256i mask256 = _mm256_set_epi32 ( \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ + const __m128i mask128 = _mm_set_epi32 ( \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ + const __m256i zero256 = _mm256_setzero_si256 (); \ + int i; \ + \ + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ +\ + while (row_parts_out != row_parts_out_max) \ + { \ + __m256i a0 = _mm256_setzero_si256 (); \ + __m128i a1; \ +\ + for (i = 0; i < (1 << ((n_halvings) - 1)); i++) \ + { \ + __m256i m0, m1; \ + __m256i factors; \ + __m128i n0, n1, n2, n3, n4, n5; \ + const uint64_t * SMOL_RESTRICT p0; \ +\ + p0 = row_parts_in + *(precalc_x++) * 2; \ + n4 = _mm_set1_epi16 (*(precalc_x++)); \ + n0 = _mm_load_si128 ((__m128i *) p0); \ + n1 = _mm_load_si128 ((__m128i *) p0 + 1); \ +\ + p0 = row_parts_in + *(precalc_x++) * 2; \ + n5 = _mm_set1_epi16 (*(precalc_x++)); \ + n2 = _mm_load_si128 ((__m128i *) p0); \ + n3 = _mm_load_si128 ((__m128i *) p0 + 1); \ +\ + m0 = _mm256_set_m128i (n2, n0); \ + m1 = _mm256_set_m128i (n3, n1); \ + factors = _mm256_set_m128i (n5, n4); \ + factors = _mm256_blend_epi16 (factors, zero256, 0xaa); \ +\ + m0 = LERP_SIMD256_EPI32_AND_MASK (m0, m1, factors, mask256); \ + a0 = _mm256_add_epi32 (a0, m0); \ + } \ +\ + a1 = _mm_add_epi32 (_mm256_extracti128_si256 (a0, 0), \ + _mm256_extracti128_si256 (a0, 1)); \ + a1 = _mm_srli_epi32 (a1, (n_halvings)); \ + a1 = _mm_and_si128 (a1, mask128); \ + _mm_store_si128 ((__m128i *) row_parts_out, a1); \ + row_parts_out += 2; \ + } \ +} + +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(1) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(2) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(3) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(4) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(5) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(6) + +static SMOL_INLINE void +unpack_box_precalc (const uint32_t precalc, + uint32_t step, + uint32_t *ofs0, + uint32_t *ofs1, + uint32_t *f0, + uint32_t *f1, + uint32_t *n) +{ + *ofs0 = precalc; + *ofs1 = *ofs0 + step; + *f0 = 256 
- (*ofs0 % SMOL_SUBPIXEL_MUL); + *f1 = *ofs1 % SMOL_SUBPIXEL_MUL; + *ofs0 /= SMOL_SUBPIXEL_MUL; + *ofs1 /= SMOL_SUBPIXEL_MUL; + *n = *ofs1 - *ofs0 - 1; +} static void interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) { const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out - 1; - uint64_t accum = 0; - uint64_t p, q, r, s; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t accum; - pp = row_parts_in; - p = weight_pixel_64bpp (*(pp++), 256); - n = *(ofs_x++); + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); - while (row_parts_out != row_parts_out_max) + while (dest_row_parts < dest_row_parts_max) { - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; - r = *(pp++); - s = r * F; + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); - q = (s >> 8) & 0x00ff00ff00ff00ffULL; + pp = src_row_parts + ofs0; - accum += p + q; - - /* (255 * r) - (F * r) */ - p = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; + accum = weight_pixel_64bpp (*(pp++), f0); + sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); + accum += weight_pixel_64bpp (*pp, f1); - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); - accum = 0; + *(dest_row_parts++) = scale_64bpp (accum, scale_ctx->hdim.span_mul); } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - q = 0; - F = *(ofs_x); - if (F > 0) - q = weight_pixel_64bpp (*(pp), F); - - accum += p + q; - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); } static void interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) { const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + (scale_ctx->width_out - /* 2 */ 1) * 2; - uint64_t accum [2] = { 0, 0 }; - uint64_t p [2], q [2], r [2], s [2]; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + uint64_t accum [2]; - p [0] = *(pp++); - p [1] = *(pp++); - weight_pixel_128bpp (p, p, 256); + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) + while (dest_row_parts < dest_row_parts_max) { - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + uint64_t t [2]; - r [0] = *(pp++); - r [1] = 
*(pp++); + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); - s [0] = r [0] * F; - s [1] = r [1] * F; + pp = src_row_parts + (ofs0 * 2); - q [0] = (s [0] >> 8) & 0x00ffffff00ffffff; - q [1] = (s [1] >> 8) & 0x00ffffff00ffffff; + weight_pixel_128bpp (pp, accum, f0); + pp += 2; - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; + sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - p [0] = (((r [0] << 8) - r [0] - s [0]) >> 8) & 0x00ffffff00ffffff; - p [1] = (((r [1] << 8) - r [1] - s [1]) >> 8) & 0x00ffffff00ffffff; + weight_pixel_128bpp (pp, t, f1); + accum [0] += t [0]; + accum [1] += t [1]; scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); - - accum [0] = 0; - accum [1] = 0; + scale_ctx->hdim.span_mul, + (uint64_t ** SMOL_RESTRICT) &dest_row_parts); } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - q [0] = 0; - q [1] = 0; - - F = *(ofs_x); - if (F > 0) - { - q [0] = *(pp++); - q [1] = *(pp++); - weight_pixel_128bpp (q, q, F); - } - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); } static void @@ -1693,7 +2203,7 @@ interp_horizontal_one_64bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; uint64_t part; SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); @@ -1709,7 +2219,7 @@ interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); @@ -1729,7 +2239,7 @@ interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * sizeof (uint64_t)); + memcpy (row_parts_out, row_parts_in, scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); } static void @@ -1740,76 +2250,81 @@ interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * 2 * sizeof (uint64_t)); + memcpy (row_parts_out, row_parts_in, scale_ctx->hdim.placement_size_px * 2 * sizeof (uint64_t)); } static void scale_horizontal (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - const uint32_t *row_in, - uint64_t *row_parts_out) + SmolLocalCtx *local_ctx, + const char *src_row, + uint64_t *dest_row_parts) { - uint64_t * SMOL_RESTRICT unpacked_in; + uint64_t * SMOL_RESTRICT src_row_unpacked; - unpacked_in = vertical_ctx->parts_row [3]; + src_row_unpacked = local_ctx->parts_row [3]; /* 32-bit unpackers need 32-bit alignment */ - if ((((uintptr_t) row_in) & 3) - && scale_ctx->pixel_type_in != SMOL_PIXEL_RGB8 - && scale_ctx->pixel_type_in != SMOL_PIXEL_BGR8) + if ((((uintptr_t) src_row) & 3) 
+ && scale_ctx->src_pixel_type != SMOL_PIXEL_RGB8 + && scale_ctx->src_pixel_type != SMOL_PIXEL_BGR8) { - if (!vertical_ctx->in_aligned) - vertical_ctx->in_aligned = - smol_alloc_aligned (scale_ctx->width_in * sizeof (uint32_t), - &vertical_ctx->in_aligned_storage); - memcpy (vertical_ctx->in_aligned, row_in, scale_ctx->width_in * sizeof (uint32_t)); - row_in = vertical_ctx->in_aligned; + if (!local_ctx->src_aligned) + local_ctx->src_aligned = + smol_alloc_aligned (scale_ctx->hdim.src_size_px * sizeof (uint32_t), + &local_ctx->src_aligned_storage); + memcpy (local_ctx->src_aligned, src_row, scale_ctx->hdim.src_size_px * sizeof (uint32_t)); + src_row = (const char *) local_ctx->src_aligned; } - scale_ctx->unpack_row_func (row_in, - unpacked_in, - scale_ctx->width_in); + scale_ctx->src_unpack_row_func (src_row, + src_row_unpacked, + scale_ctx->hdim.src_size_px); scale_ctx->hfilter_func (scale_ctx, - unpacked_in, - row_parts_out); + src_row_unpacked, + dest_row_parts); + + apply_horiz_edge_opacity (scale_ctx, dest_row_parts); } -/* --- Vertical scaling --- */ +/* ---------------- * + * Vertical scaling * + * ---------------- */ static void -update_vertical_ctx_bilinear (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index) +update_local_ctx_bilinear (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { - uint32_t new_in_ofs = scale_ctx->offsets_y [outrow_index * 2]; + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t new_src_ofs = precalc_y [dest_row_index * 2]; - if (new_in_ofs == vertical_ctx->in_ofs) + if (new_src_ofs == local_ctx->src_ofs) return; - if (new_in_ofs == vertical_ctx->in_ofs + 1) + if (new_src_ofs == local_ctx->src_ofs + 1) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + uint64_t *t = local_ctx->parts_row [0]; + local_ctx->parts_row [0] = local_ctx->parts_row [1]; + local_ctx->parts_row [1] = t; scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); } else { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs), - vertical_ctx->parts_row [0]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs), + local_ctx->parts_row [0]); scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); } - vertical_ctx->in_ofs = new_in_ofs; + local_ctx->src_ofs = new_src_ofs; } static void @@ -1819,13 +2334,37 @@ interp_vertical_bilinear_store_64bpp (uint64_t F, uint64_t * SMOL_RESTRICT parts_out, uint32_t width) { + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); uint64_t *parts_out_last = parts_out + width; + __m256i F256; SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - do + F256 = _mm256_set1_epi16 ((uint16_t) F); + + while (parts_out + 4 <= parts_out_last) + { + __m256i m0, m1; + + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); + top_row_parts_in += 4; + m1 = _mm256_load_si256 ((const __m256i 
*) bottom_row_parts_in); + bottom_row_parts_in += 4; + + m0 = _mm256_sub_epi16 (m0, m1); + m0 = _mm256_mullo_epi16 (m0, F256); + m0 = _mm256_srli_epi16 (m0, 8); + m0 = _mm256_add_epi16 (m0, m1); + m0 = _mm256_and_si256 (m0, mask); + + _mm256_store_si256 ((__m256i *) parts_out, m0); + parts_out += 4; + } + + while (parts_out != parts_out_last) { uint64_t p, q; @@ -1834,23 +2373,76 @@ interp_vertical_bilinear_store_64bpp (uint64_t F, *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; } - while (parts_out != parts_out_last); } static void -interp_vertical_bilinear_add_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, +interp_vertical_bilinear_store_with_opacity_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_64bpp (uint16_t F, + const uint64_t *top_row_parts_in, + const uint64_t *bottom_row_parts_in, + uint64_t *accum_out, uint32_t width) { + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); uint64_t *accum_out_last = accum_out + width; + __m256i F256; SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - do + F256 = _mm256_set1_epi16 ((uint16_t) F); + + while (accum_out + 4 <= accum_out_last) + { + __m256i m0, m1, o0; + + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); + top_row_parts_in += 4; + m1 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); + bottom_row_parts_in += 4; + o0 = _mm256_load_si256 ((const __m256i *) accum_out); + + m0 = _mm256_sub_epi16 (m0, m1); + m0 = _mm256_mullo_epi16 (m0, F256); + m0 = _mm256_srli_epi16 (m0, 8); + m0 = _mm256_add_epi16 (m0, m1); + m0 = _mm256_and_si256 (m0, mask); + + o0 = _mm256_add_epi16 (o0, m0); + _mm256_store_si256 ((__m256i *) accum_out, o0); + accum_out += 4; + } + + while (accum_out != accum_out_last) { uint64_t p, q; @@ -1859,7 +2451,6 @@ interp_vertical_bilinear_add_64bpp (uint64_t F, *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; } - while (accum_out != accum_out_last); } static void @@ -1922,6 +2513,34 @@ interp_vertical_bilinear_store_128bpp (uint64_t F, } } +static void +interp_vertical_bilinear_store_with_opacity_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + 
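+    /* Blend the top and bottom source rows with weight F (0..256), then
+     * attenuate each blended value by the destination row's subpixel
+     * opacity before storing. Callers use this variant for partially
+     * covered first/last output rows. */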
do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + apply_subpixel_opacity_128bpp_half (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + static void interp_vertical_bilinear_add_128bpp (uint64_t F, const uint64_t * SMOL_RESTRICT top_row_parts_in, @@ -1954,7 +2573,7 @@ interp_vertical_bilinear_add_128bpp (uint64_t F, m3 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); bottom_row_parts_in += 4; o0 = _mm256_load_si256 ((const __m256i *) accum_out); - o1 = _mm256_load_si256 ((const __m256i *) accum_out + 4); + o1 = _mm256_load_si256 ((const __m256i *) (accum_out + 4)); m0 = _mm256_sub_epi32 (m0, m1); m2 = _mm256_sub_epi32 (m2, m3); @@ -1986,230 +2605,418 @@ interp_vertical_bilinear_add_128bpp (uint64_t F, } } -#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ +#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ const uint64_t * SMOL_RESTRICT top_row_parts_in, \ const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + uint32_t width) \ +{ \ + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, \ + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); \ + uint64_t *accum_inout_last = accum_inout + width; \ + __m256i F256; \ +\ + SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + F256 = _mm256_set1_epi16 ((uint16_t) F); \ +\ + while (accum_inout + 4 <= accum_inout_last) \ + { \ + __m256i m0, m1, o0; \ +\ + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); \ + top_row_parts_in += 4; \ + m1 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); \ + bottom_row_parts_in += 4; \ + o0 = _mm256_load_si256 ((const __m256i *) accum_inout); \ +\ + m0 = _mm256_sub_epi16 (m0, m1); \ + m0 = _mm256_mullo_epi16 (m0, F256); \ + m0 = _mm256_srli_epi16 (m0, 8); \ + m0 = _mm256_add_epi16 (m0, m1); \ + m0 = _mm256_and_si256 (m0, mask); \ +\ + o0 = _mm256_add_epi16 (o0, m0); \ + o0 = _mm256_srli_epi16 (o0, n_halvings); \ +\ + _mm256_store_si256 ((__m256i *) accum_inout, o0); \ + accum_inout += 4; \ + } \ +\ + while (accum_inout != accum_inout_last) \ + { \ + uint64_t p, q; \ +\ + p = *(top_row_parts_in++); \ + q = *(bottom_row_parts_in++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + *(accum_inout++) = p; \ + } \ +} \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t 
*accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} \ - \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ +\ + apply_subpixel_opacity_64bpp (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} - -#define DEF_SCALE_OUTROW_BILINEAR(n_halvings) \ -static void \ -scale_outrow_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} \ - \ -static void \ 
-scale_outrow_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} - -static void -scale_outrow_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + apply_subpixel_opacity_128bpp_half (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} + +#define DEF_SCALE_DEST_ROW_BILINEAR(n_halvings) \ +static int \ +scale_dest_row_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + 
{ \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ +\ + return 2; \ +} \ +\ +static int \ +scale_dest_row_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ +\ + return 2; \ +} + +static int +scale_dest_row_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + 
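+    /* No halvings: each destination row is produced from a single bilinear
+     * sample pair. Rows at the placement edges with partial subpixel
+     * coverage (first_opacity or last_opacity < 256) take the
+     * opacity-aware store path below. */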
update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); -static void -scale_outrow_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + return 2; } DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) -static void -scale_outrow_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); +static int +scale_dest_row_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + 
local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; } -static void -scale_outrow_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); +static int +scale_dest_row_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + 
interp_vertical_bilinear_final_1h_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; } DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) -DEF_SCALE_OUTROW_BILINEAR(2) +DEF_SCALE_DEST_ROW_BILINEAR(2) DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) -DEF_SCALE_OUTROW_BILINEAR(3) +DEF_SCALE_DEST_ROW_BILINEAR(3) DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) -DEF_SCALE_OUTROW_BILINEAR(4) +DEF_SCALE_DEST_ROW_BILINEAR(4) DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) -DEF_SCALE_OUTROW_BILINEAR(5) +DEF_SCALE_DEST_ROW_BILINEAR(5) DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) -DEF_SCALE_OUTROW_BILINEAR(6) +DEF_SCALE_DEST_ROW_BILINEAR(6) static void finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, @@ -2229,581 +3036,418 @@ finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, } static void -weight_edge_row_64bpp (uint64_t *row, - uint16_t w, - uint32_t n) +finalize_vertical_with_opacity_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) { - uint64_t *row_max = row + n; + uint64_t *parts_dest_max = dest_parts + n; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (row != row_max) + while (dest_parts != parts_dest_max) { - *row = ((*row * w) >> 8) & 0x00ff00ff00ff00ffULL; - row++; + *dest_parts = scale_64bpp (*(accums++), multiplier); + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; } } -static void -scale_and_weight_edge_rows_box_64bpp (const uint64_t * SMOL_RESTRICT first_row, - uint64_t * SMOL_RESTRICT last_row, - uint64_t * SMOL_RESTRICT accum, - uint16_t w2, - uint32_t n) +static int +scale_dest_row_box_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { - const uint64_t *first_row_max = first_row + n; - - SMOL_ASSUME_ALIGNED (first_row, const uint64_t *); - SMOL_ASSUME_ALIGNED (last_row, uint64_t *); - SMOL_ASSUME_ALIGNED (accum, uint64_t *); + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; - while (first_row != first_row_max) - { - uint64_t r, s, p, q; + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); - p = *(first_row++); + /* First input row */ - r = *(last_row); - s = r * w2; - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - /* (255 * r) - (F * r) */ - *(last_row++) = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; - *(accum++) = p + q; - } -} + /* Add up whole input rows */ -static void -update_vertical_ctx_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t ofs_y, - uint32_t ofs_y_max, - uint16_t w1, - uint16_t w2) -{ - /* Old in_ofs is the previous max */ - if (ofs_y == vertical_ctx->in_ofs) + for (i = 0; i < n; i++) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + 
local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px); + + ofs_y++; } - else + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_edge_row_64bpp (vertical_ctx->parts_row [0], w1, scale_ctx->width_out); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); } - /* When w2 == 0, the final inrow may be out of bounds. Don't try to access it in - * that case. */ - if (w2 || ofs_y_max < scale_ctx->height_in) + /* Finalize */ + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y_max), - vertical_ctx->parts_row [1]); + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); } - else + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) { - memset (vertical_ctx->parts_row [1], 0, scale_ctx->width_out * sizeof (uint64_t)); + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); } - - vertical_ctx->in_ofs = ofs_y_max; -} - -static void -scale_outrow_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t ofs_y, ofs_y_max; - uint16_t w1, w2; - - /* Get the inrow range for this outrow: [ofs_y .. ofs_y_max> */ - - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first and last rows, weight them and store in accumulator */ - - w1 = (outrow_index == 0) ? 
256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1]; - w2 = scale_ctx->offsets_y [outrow_index * 2 + 1]; - - update_vertical_ctx_box_64bpp (scale_ctx, vertical_ctx, ofs_y, ofs_y_max, w1, w2); - - scale_and_weight_edge_rows_box_64bpp (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - w2, - scale_ctx->width_out); - - ofs_y++; - - /* Add up whole rows */ - - while (ofs_y < ofs_y_max) + else { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - add_parts (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - - ofs_y++; + finalize_vertical_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); } - finalize_vertical_64bpp (vertical_ctx->parts_row [2], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [0], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 0; } static void finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, + uint64_t * SMOL_RESTRICT dest_parts, uint32_t n) { - uint64_t *parts_out_max = parts_out + n * 2; + uint64_t *parts_dest_max = dest_parts + n * 2; SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (parts_out != parts_out_max) + while (dest_parts != parts_dest_max) { - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); } } static void -weight_row_128bpp (uint64_t *row, - uint16_t w, - uint32_t n) +finalize_vertical_with_opacity_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) { - uint64_t *row_max = row + (n * 2); + uint64_t *parts_dest_max = dest_parts + n * 2; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (row != row_max) + while (dest_parts != parts_dest_max) { - row [0] = ((row [0] * w) >> 8) & 0x00ffffff00ffffffULL; - row [1] = ((row [1] * w) >> 8) & 0x00ffffff00ffffffULL; - row += 2; + dest_parts [0] = scale_128bpp_half (*(accums++), multiplier); + dest_parts [1] = scale_128bpp_half (*(accums++), multiplier); + apply_subpixel_opacity_128bpp (dest_parts, opacity); + dest_parts += 2; } } -static void -scale_outrow_box_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +static int +scale_dest_row_box_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { + uint32_t *precalc_y = scale_ctx->vdim.precalc; uint32_t ofs_y, ofs_y_max; - uint16_t w; + uint32_t w1, w2; + uint32_t n, i; - /* Get the inrow range for this outrow: [ofs_y .. 
ofs_y_max> */ + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first inrow and store it */ + /* First input row */ scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_row_128bpp (vertical_ctx->parts_row [0], - outrow_index == 0 ? 256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1], - scale_ctx->width_out); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); ofs_y++; - /* Add up whole rows */ + /* Add up whole input rows */ - while (ofs_y < ofs_y_max) + for (i = 0; i < n; i++) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px * 2); ofs_y++; } - /* Final row is optional; if this is the bottommost outrow it could be out of bounds */ + /* Last input row */ - w = scale_ctx->offsets_y [outrow_index * 2 + 1]; - if (w > 0) + if (ofs_y < scale_ctx->vdim.src_size_px) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - weight_row_128bpp (vertical_ctx->parts_row [1], - w - 1, /* Subtract 1 to avoid overflow */ - scale_ctx->width_out); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); } - finalize_vertical_128bpp (vertical_ctx->parts_row [0], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [1], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [1], row_out, scale_ctx->width_out); + return 0; } -static void -scale_outrow_one_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_one_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { - SMOL_UNUSED (row_index); - /* Scale the row and store it */ - if (vertical_ctx->in_ofs != 0) + if (local_ctx->src_ofs != 0) { scale_horizontal (scale_ctx, - 
vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 1; } -static void -scale_outrow_one_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_one_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { - SMOL_UNUSED (row_index); - /* Scale the row and store it */ - if (vertical_ctx->in_ofs != 0) + if (local_ctx->src_ofs != 0) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t) * 2); } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 1; } -static void -scale_outrow_copy (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_copy (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, row_index), - vertical_ctx->parts_row [0]); - - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); -} + local_ctx, + src_row_ofs_to_pointer (scale_ctx, row_index), + local_ctx->parts_row [0]); + + return 0; +} + +/* --------------- * + * Function tables * + * --------------- */ + +#define R SMOL_REPACK_META + +static const SmolRepackMeta repack_meta [] = +{ + R (123, 24, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, 
UNASSOCIATED, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + + SMOL_REPACK_META_LAST +}; -/* --- Conversion tables --- */ +#undef R -static const SmolConversionTable avx2_conversions = +static const SmolImplementation implementation = { -{ { - /* Conversions where accumulators must hold the sum of fewer than - * 256 pixels. This can be done in 64bpp, but 128bpp may be used - * e.g. for 16 bits per channel internally premultiplied data. 
*/ - - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 
123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, 
p, 132a, p, 123, p, 64), - } - }, + /* Horizontal init */ + init_horizontal, - { - /* Conversions where accumulators must hold the sum of up to - * 65535 pixels. We need 128bpp for this. */ - - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ 
SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 
1234, p, 1234, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - } -} } -}; + /* Vertical init */ + init_vertical, -static const SmolImplementation avx2_implementation = -{ { /* Horizontal filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, { /* 64bpp */ interp_horizontal_copy_64bpp, @@ -2833,38 +3477,65 @@ static const SmolImplementation avx2_implementation = }, { /* Vertical filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, { /* 64bpp */ - scale_outrow_copy, - scale_outrow_one_64bpp, - scale_outrow_bilinear_0h_64bpp, - scale_outrow_bilinear_1h_64bpp, - scale_outrow_bilinear_2h_64bpp, - scale_outrow_bilinear_3h_64bpp, - scale_outrow_bilinear_4h_64bpp, - scale_outrow_bilinear_5h_64bpp, - scale_outrow_bilinear_6h_64bpp, - scale_outrow_box_64bpp + scale_dest_row_copy, + scale_dest_row_one_64bpp, + scale_dest_row_bilinear_0h_64bpp, + scale_dest_row_bilinear_1h_64bpp, + scale_dest_row_bilinear_2h_64bpp, + scale_dest_row_bilinear_3h_64bpp, + scale_dest_row_bilinear_4h_64bpp, + scale_dest_row_bilinear_5h_64bpp, + scale_dest_row_bilinear_6h_64bpp, + scale_dest_row_box_64bpp }, { /* 128bpp */ - scale_outrow_copy, - scale_outrow_one_128bpp, - scale_outrow_bilinear_0h_128bpp, - scale_outrow_bilinear_1h_128bpp, - scale_outrow_bilinear_2h_128bpp, - scale_outrow_bilinear_3h_128bpp, - scale_outrow_bilinear_4h_128bpp, - scale_outrow_bilinear_5h_128bpp, - scale_outrow_bilinear_6h_128bpp, - scale_outrow_box_128bpp + scale_dest_row_copy, + scale_dest_row_one_128bpp, + scale_dest_row_bilinear_0h_128bpp, + scale_dest_row_bilinear_1h_128bpp, + scale_dest_row_bilinear_2h_128bpp, + scale_dest_row_bilinear_3h_128bpp, + scale_dest_row_bilinear_4h_128bpp, + scale_dest_row_bilinear_5h_128bpp, + scale_dest_row_bilinear_6h_128bpp, + scale_dest_row_box_128bpp } }, - &avx2_conversions + { + /* Composite over color */ + NULL, + NULL, + NULL, + NULL + }, + { + /* Composite over dest */ + NULL, + NULL, + NULL, + NULL + }, + { + /* Clear dest */ + NULL, + NULL, + NULL, + NULL + }, + repack_meta }; const SmolImplementation * _smol_get_avx2_implementation (void) { - return &avx2_implementation; + return &implementation; } diff --git a/chafa/internal/smolscale/smolscale-generic.c b/chafa/internal/smolscale/smolscale-generic.c new file mode 100644 index 00000000..1a27db8c --- /dev/null +++ b/chafa/internal/smolscale/smolscale-generic.c @@ -0,0 +1,3080 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Copyright © 2019-2023 Hans Petter Jansson. See COPYING for details. */ + +#include +#include /* malloc, free, alloca */ +#include /* memset */ +#include +#include "smolscale-private.h" + +/* ---------------------- * + * Context initialization * + * ---------------------- */ + +/* Linear precalc array: + * + * Each sample is extracted from a pair of adjacent pixels. The sample precalc + * consists of the first pixel's index, followed by its sample fraction [0..256]. + * The second sample is implicitly taken at index+1 and weighted as 256-fraction. 
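+ * E.g. an entry of { 3, 192 } means 192/256 of pixel 3 plus 64/256 of pixel 4.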
+ * _ _ _ + * In |_| |_| |_| + * \_/ \_/ <- two samples per output pixel + * Out |_| |_| + * + * When halving, + * _ _ _ + * In |_| |_| |_| + * \_/ \_/ <- four samples per output pixel + * |_| |_| + * \_/ <- halving + * Out |_| + */ + +static void +precalc_linear_range (uint16_t *array_out, + int first_index, int last_index, + uint64_t first_sample_ofs, uint64_t sample_step, + int sample_ofs_px_max, + int32_t dest_clip_before_px, + int *array_i_inout) +{ + uint64_t sample_ofs; + int i; + + sample_ofs = first_sample_ofs; + + for (i = first_index; i < last_index; i++) + { + uint16_t sample_ofs_px = sample_ofs / SMOL_BILIN_MULTIPLIER; + + if (sample_ofs_px >= sample_ofs_px_max - 1) + { + if (i >= dest_clip_before_px) + { + array_out [(*array_i_inout) * 2] = sample_ofs_px_max - 2; + array_out [(*array_i_inout) * 2 + 1] = 0; + (*array_i_inout)++; + } + continue; + } + + if (i >= dest_clip_before_px) + { + array_out [(*array_i_inout) * 2] = sample_ofs_px; + array_out [(*array_i_inout) * 2 + 1] = SMOL_SMALL_MUL + - ((sample_ofs / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); + (*array_i_inout)++; + } + + sample_ofs += sample_step; + } +} + +static void +precalc_bilinear_array (uint16_t *array, + uint64_t src_dim_spx, + uint64_t dest_ofs_spx, + uint64_t dest_dim_spx, + uint32_t dest_dim_prehalving_px, + unsigned int n_halvings, + int32_t dest_clip_before_px) +{ + uint32_t src_dim_px = SMOL_SPX_TO_PX (src_dim_spx); + uint64_t first_sample_ofs [3]; + uint64_t sample_step; + int i = 0; + + assert (src_dim_px > 1); + + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; + + if (src_dim_spx > dest_dim_spx) + { + /* Minification */ + sample_step = ((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / dest_dim_spx; + first_sample_ofs [0] = (sample_step - SMOL_BILIN_MULTIPLIER) / 2; + first_sample_ofs [1] = ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + + ((sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx) * (1 << n_halvings)) / SMOL_SUBPIXEL_MUL); + } + else + { + /* Magnification */ + sample_step = ((src_dim_spx - SMOL_SUBPIXEL_MUL) * SMOL_BILIN_MULTIPLIER) + / (dest_dim_spx > SMOL_SUBPIXEL_MUL ? (dest_dim_spx - SMOL_SUBPIXEL_MUL) : 1); + first_sample_ofs [0] = 0; + first_sample_ofs [1] = (sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL; + } + + first_sample_ofs [2] = (((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / SMOL_SUBPIXEL_MUL) + + ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + - sample_step * (1U << n_halvings); + + /* Left fringe */ + precalc_linear_range (array, + 0, + 1 << n_halvings, + first_sample_ofs [0], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); + + /* Main range */ + precalc_linear_range (array, + 1 << n_halvings, + dest_dim_prehalving_px - (1 << n_halvings), + first_sample_ofs [1], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); + + /* Right fringe */ + precalc_linear_range (array, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px, + first_sample_ofs [2], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); +} + +static void +precalc_boxes_array (uint32_t *array, + uint32_t *span_step, + uint32_t *span_mul, + uint32_t src_dim_spx, + int32_t dest_dim, + uint32_t dest_ofs_spx, + uint32_t dest_dim_spx, + int32_t dest_clip_before_px) +{ + uint64_t fracF, frac_stepF; + uint64_t f; + uint64_t stride; + uint64_t a, b; + int i, dest_i; + + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; + + /* Output sample can't be less than a pixel. Fringe opacity is applied in + * a separate step. 
FIXME: May cause wrong subpixel distribution -- revisit. */ + if (dest_dim_spx < 256) + dest_dim_spx = 256; + + frac_stepF = ((uint64_t) src_dim_spx * SMOL_BIG_MUL) / (uint64_t) dest_dim_spx; + fracF = 0; + + stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; + f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; + + /* We divide by (b + 1) instead of just (b) to avoid overflows in + * scale_128bpp_half(), which would affect horizontal box scaling. The + * fudge factor counters limited precision in the inverted division + * operation. It causes 16-bit values to undershoot by less than 127/65535 + * (<.2%). Since the final output is 8-bit, and rounding neutralizes the + * error, this doesn't matter. */ + + a = (SMOL_BOXES_MULTIPLIER * 255); + b = ((stride * 255) + ((f * 255) / 256)); + *span_step = frac_stepF / SMOL_SMALL_MUL; + *span_mul = (a + (b / 2)) / (b + 1); + + /* Left fringe */ + i = 0; + dest_i = 0; + + if (dest_i >= dest_clip_before_px) + array [i++] = 0; + + /* Main range */ + fracF = ((frac_stepF * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL); + for (dest_i = 1; dest_i < dest_dim - 1; dest_i++) + { + if (dest_i >= dest_clip_before_px) + array [i++] = fracF / SMOL_SMALL_MUL; + fracF += frac_stepF; + } + + /* Right fringe */ + if (dest_dim > 1 && dest_i >= dest_clip_before_px) + array [i++] = (((uint64_t) src_dim_spx * SMOL_SMALL_MUL - frac_stepF) / SMOL_SMALL_MUL); +} + +static void +init_dim (SmolDim *dim) +{ + if (dim->filter_type == SMOL_FILTER_ONE || dim->filter_type == SMOL_FILTER_COPY) + { + } + else if (dim->filter_type == SMOL_FILTER_BOX) + { + precalc_boxes_array (dim->precalc, + &dim->span_step, + &dim->span_mul, + dim->src_size_spx, + dim->placement_size_px, + dim->placement_ofs_spx, + dim->placement_size_spx, + dim->clip_before_px); + } + else /* SMOL_FILTER_BILINEAR_?H */ + { + precalc_bilinear_array (dim->precalc, + dim->src_size_spx, + dim->placement_ofs_spx, + dim->placement_size_prehalving_spx, + dim->placement_size_prehalving_px, + dim->n_halvings, + dim->clip_before_px); + } +} + +static void +init_horizontal (SmolScaleCtx *scale_ctx) +{ + init_dim (&scale_ctx->hdim); +} + +static void +init_vertical (SmolScaleCtx *scale_ctx) +{ + init_dim (&scale_ctx->vdim); +} + +/* ---------------------- * + * sRGB/linear conversion * + * ---------------------- */ + +static void +from_srgb_pixel_xxxa_128bpp (uint64_t * SMOL_RESTRICT pixel_inout) +{ + uint64_t part; + + part = pixel_inout [0]; + pixel_inout [0] = + ((uint64_t) _smol_from_srgb_lut [part >> 32] << 32) + | _smol_from_srgb_lut [part & 0xff]; + + part = pixel_inout [1]; + pixel_inout [1] = + ((uint64_t) _smol_from_srgb_lut [part >> 32] << 32) + | ((part & 0xffffffff) << 3) | 7; +} + +static void +to_srgb_pixel_xxxa_128bpp (const uint64_t *pixel_in, uint64_t *pixel_out) +{ + pixel_out [0] = + (((uint64_t) _smol_to_srgb_lut [pixel_in [0] >> 32]) << 32) + | _smol_to_srgb_lut [pixel_in [0] & 0xffff]; + + pixel_out [1] = + (((uint64_t) _smol_to_srgb_lut [pixel_in [1] >> 32]) << 32) + | (pixel_in [1] & 0xffffffff); /* FIXME: No need to preserve alpha? */ +} + +/* Fetches alpha from linear pixel. Input alpha is in the range [0x000..0x7ff]. + * Returned alpha is in the range [0x00..0xff], rounded towards 0xff. 
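 * E.g. a linear alpha of 0x7ff yields (0x7ff + 7) >> 3 = 0x100, which the
 * final subtraction clamps to 0xff; 0x000 maps to 0x00.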
*/ +static SMOL_INLINE uint8_t +get_alpha_from_linear_xxxa_128bpp (const uint64_t * SMOL_RESTRICT pixel_in) +{ + uint16_t alpha = (pixel_in [1] + 7) >> 3; + return (uint8_t) (alpha - (alpha >> 8)); /* Turn 0x100 into 0xff */ +} + +/* ----------------- * + * Premultiplication * + * ----------------- */ + +static SMOL_INLINE void +premul_u_to_p8_128bpp (uint64_t * SMOL_RESTRICT inout, + uint16_t alpha) +{ + inout [0] = ((inout [0] * (alpha + 1)) >> 8) & 0x000000ff000000ff; + inout [1] = ((inout [1] * (alpha + 1)) >> 8) & 0x000000ff000000ff; +} + +static SMOL_INLINE void +unpremul_p8_to_u_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; + out [1] = ((in [1] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; +} + +static SMOL_INLINE uint64_t +premul_u_to_p8_64bpp (const uint64_t in, + uint16_t alpha) +{ + return ((in * (alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; +} + +static SMOL_INLINE uint64_t +unpremul_p8_to_u_64bpp (const uint64_t in, + uint8_t alpha) +{ + uint64_t in_128bpp [2]; + uint64_t dest_128bpp [2]; + + in_128bpp [0] = (in & 0x000000ff000000ff); + in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; + + unpremul_p8_to_u_128bpp (in_128bpp, dest_128bpp, alpha); + + return dest_128bpp [0] | (dest_128bpp [1] << 16); +} + +static SMOL_INLINE void +premul_ul_to_p8l_128bpp (uint64_t * SMOL_RESTRICT inout, + uint16_t alpha) +{ + inout [0] = ((inout [0] * (alpha + 1)) >> 8) & 0x000007ff000007ff; + inout [1] = (((inout [1] * (alpha + 1)) >> 8) & 0x000007ff00000000) + | (inout [1] & 0x000007ff); +} + +static SMOL_INLINE void +unpremul_p8l_to_ul_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p8l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8L) & 0x000007ff000007ff; + out [1] = ((in [1] * _smol_inv_div_p8l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8L) & 0x000007ff000007ff; +} + +static SMOL_INLINE void +premul_u_to_p16_128bpp (uint64_t *inout, + uint8_t alpha) +{ + inout [0] = inout [0] * ((uint16_t) alpha + 2); + inout [1] = inout [1] * ((uint16_t) alpha + 2); +} + +static SMOL_INLINE void +unpremul_p16_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; + out [1] = ((in [1] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; +} + +static SMOL_INLINE void +premul_ul_to_p16l_128bpp (uint64_t *inout, + uint8_t alpha) +{ + inout [0] = inout [0] * ((uint16_t) alpha + 2); + inout [1] = inout [1] * ((uint16_t) alpha + 2); +} + +static SMOL_INLINE void +unpremul_p16l_to_ul_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p16l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16L) & 0x000007ff000007ffULL; + out [1] = ((in [1] * _smol_inv_div_p16l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16L) & 0x000007ff000007ffULL; +} + +/* --------- * + * Repacking * + * --------- */ + +/* It's nice to be able to shift by a negative amount */ +#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) + +/* This is kind of bulky (~13 x86 insns), but it's about the same as using + * unions, and we don't have to worry about endianness. 
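 * PACK_FROM_1234_64BPP(in, a, b, c, d) picks channels a, b, c and d out of a
 * 64bpp pixel and packs them into a 32-bit value, most significant byte first;
 * the 128bpp variant does the same for a two-word pixel.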
*/ +#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ + ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ + | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) + +#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 2 : n) + +#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +/* ---------------------- * + * Repacking: 24/32 -> 64 * + * ---------------------- */ + +static SMOL_INLINE uint64_t +unpack_pixel_123_p8_to_132a_p8_64bpp (const uint8_t *p) +{ + return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) + | ((uint64_t) p [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_123_p8_to_132a_p8_64bpp (src_row); + src_row += 3; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_1324_p8_64bpp (uint32_t p) +{ + return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_3241_p8_64bpp (uint32_t p) +{ + return (((uint64_t) p & 0x0000ff00) << 40) + | (((uint64_t) p & 0x00ff00ff) << 16) | (p >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_3241_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_2431_p8_64bpp (uint32_t p) +{ + uint64_t p64 = p; + + return ((p64 & 0x00ff00ff) << 32) | ((p64 & 0x0000ff00) << 8) + | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_2431_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_a234_u_to_324a_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 40) | (((uint64_t) p & 0x00ff00ff) << 16); + uint8_t alpha = p >> 24; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_a234_u_to_324a_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_u_to_2431_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) 
| (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_u_to_2431_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_123a_u_to_132a_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_123a_u_to_132a_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +/* ----------------------- * + * Repacking: 24/32 -> 128 * + * ----------------------- */ + +static SMOL_INLINE void +unpack_pixel_123_p8_to_123a_p8_128bpp (const uint8_t *in, + uint64_t *out) +{ + out [0] = ((uint64_t) in [0] << 32) | in [1]; + out [1] = ((uint64_t) in [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_p8_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_p8_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32) | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, 
COMPRESSED, + 2341, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + p64 = (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_pl_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p8l_128bpp (out, alpha); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_pl_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + premul_u_to_p16_128bpp (out, alpha); + out [1] |= (((uint16_t) alpha) << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16l_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + from_srgb_pixel_xxxa_128bpp (out); + out [0] *= alpha; + out [1] *= alpha; + + out [1] = (out [1] & 0xffffffff00000000ULL) | (alpha << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16l_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p; + + p64 = (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} 
SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_pl_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p8l_128bpp (out, alpha); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_pl_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + premul_u_to_p16_128bpp (out, alpha); + out [1] |= (((uint16_t) alpha) << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16l_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p16l_128bpp (out, alpha); + + out [1] = (out [1] & 0xffffffff00000000ULL) | ((uint16_t) alpha << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16l_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +/* ---------------------- * + * Repacking: 64 -> 24/32 * + * ---------------------- */ + +static SMOL_INLINE uint32_t +pack_pixel_1234_p8_to_1324_p8_64bpp (uint64_t in) +{ + return in | (in >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 132, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 24; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 132, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 24; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 231, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 24; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 231, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 24; + 
src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 324, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + *(dest_row++) = p; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 324, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row >> 24; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + *(dest_row++) = p; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 423, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p; + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 423, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row >> 24; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p; + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 1324, 32, 32, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 1324, 32, 32, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_64BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_64BPP (*src_row, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint8_t alpha = *src_row; \ + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_64BPP (t, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (1, 4, 2, 3) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (2, 3, 1, 4) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 1, 3, 2) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 2, 3, 1) + +/* ----------------------- * + * Repacking: 128 -> 24/32 * + * ----------------------- */ + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = *src_row >> 32; + *(dest_row++) = *(src_row++); + *(dest_row++) = *(src_row++) >> 32; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + 
uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (src_row, t); + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } \ +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } \ +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = src_row [1] >> 32; + *(dest_row++) = src_row [0]; + *(dest_row++) = src_row [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + 
t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_128BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_128BPP (src_row, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); \ + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); \ + to_srgb_pixel_xxxa_128bpp (t, t); \ + premul_u_to_p8_128bpp (t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1]; \ + unpremul_p8_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); \ + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); \ + to_srgb_pixel_xxxa_128bpp (t, t); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16l_to_ul_128bpp (src_row, t, alpha); \ + 
to_srgb_pixel_xxxa_128bpp (t, t); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (1, 2, 3, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (3, 2, 1, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 1, 2, 3) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 3, 2, 1) + +/* -------------- * + * Filter helpers * + * -------------- */ + +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) +{ + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; +} + +static SMOL_INLINE uint64_t +weight_pixel_64bpp (uint64_t p, + uint16_t w) +{ + return ((p * w) >> 8) & 0x00ff00ff00ff00ffULL; +} + +/* p and out may be the same address */ +static SMOL_INLINE void +weight_pixel_128bpp (const uint64_t *p, + uint64_t *out, + uint16_t w) +{ + out [0] = ((p [0] * w) >> 8) & 0x00ffffff00ffffffULL; + out [1] = ((p [1] * w) >> 8) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +sum_parts_64bpp (const uint64_t ** SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT accum, + uint32_t n) +{ + const uint64_t * SMOL_RESTRICT pp = *parts_in; + const uint64_t *pp_end; + + SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t)); + + for (pp_end = pp + n; pp < pp_end; pp++) + { + *accum += *pp; + } + + *parts_in = pp; +} + +static SMOL_INLINE void +sum_parts_128bpp (const uint64_t ** SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT accum, + uint32_t n) +{ + const uint64_t * SMOL_RESTRICT pp = *parts_in; + const uint64_t *pp_end; + + SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t) * 2); + + for (pp_end = pp + n * 2; pp < pp_end; ) + { + accum [0] += *(pp++); + accum [1] += *(pp++); + } + + *parts_in = pp; +} + +static SMOL_INLINE uint64_t +scale_64bpp (uint64_t accum, + uint64_t multiplier) +{ + uint64_t a, b; + + a = ((accum & 0x0000ffff0000ffffULL) * multiplier + + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; + b = (((accum & 0xffff0000ffff0000ULL) >> 16) * multiplier + + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; + + return (a & 0x000000ff000000ffULL) | ((b & 0x000000ff000000ffULL) << 16); +} + +static SMOL_INLINE uint64_t +scale_128bpp_half (uint64_t accum, + uint64_t multiplier) +{ + uint64_t a, b; + + a = accum & 0x00000000ffffffffULL; + a = (a * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; + + b = (accum & 0xffffffff00000000ULL) >> 32; + b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; + + return a | (b << 32); +} + +static SMOL_INLINE void +scale_and_store_128bpp (const uint64_t * SMOL_RESTRICT accum, + uint64_t multiplier, + uint64_t ** SMOL_RESTRICT dest_row_parts) +{ + *(*dest_row_parts)++ = scale_128bpp_half (accum [0], multiplier); + *(*dest_row_parts)++ = scale_128bpp_half (accum [1], multiplier); +} + +static void +add_parts (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + *(parts_acc_out++) += *(parts_in++); +} + +static void +copy_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t 
*parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) = weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +copy_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + weight_pixel_128bpp (parts_in, parts_acc_out, w); + parts_in += 2; + parts_acc_out += 2; + } +} + +static void +add_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) += weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +add_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + uint64_t t [2]; + + weight_pixel_128bpp (parts_in, t, w); + parts_acc_out [0] += t [0]; + parts_acc_out [1] += t [1]; + parts_in += 2; + parts_acc_out += 2; + } +} + +static SMOL_INLINE void +apply_subpixel_opacity_64bpp (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ff00ff00ff00ffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp_half (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp (uint64_t *u64_inout, uint16_t opacity) +{ + apply_subpixel_opacity_128bpp_half (u64_inout, opacity); + apply_subpixel_opacity_128bpp_half (u64_inout + 1, opacity); +} + +static void +apply_subpixel_opacity_row_copy_64bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + n_pixels; + + while (u64_out != u64_out_max) + { + *u64_out = *u64_in++; + apply_subpixel_opacity_64bpp (u64_out, opacity); + u64_out++; + } +} + +static void +apply_subpixel_opacity_row_copy_128bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + (n_pixels * 2); + + while (u64_out != u64_out_max) + { + u64_out [0] = u64_in [0]; + u64_out [1] = u64_in [1]; + apply_subpixel_opacity_128bpp_half (u64_out, opacity); + apply_subpixel_opacity_128bpp_half (u64_out + 1, opacity); + u64_in += 2; + u64_out += 2; + } +} + +static void +apply_horiz_edge_opacity (const SmolScaleCtx *scale_ctx, + uint64_t *row_parts) +{ + if (scale_ctx->storage_type == SMOL_STORAGE_64BPP) + { + apply_subpixel_opacity_64bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_64bpp (&row_parts [scale_ctx->hdim.placement_size_px - 1], scale_ctx->hdim.last_opacity); + } + else + { + apply_subpixel_opacity_128bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + 
apply_subpixel_opacity_128bpp (&row_parts [(scale_ctx->hdim.placement_size_px - 1) * 2], scale_ctx->hdim.last_opacity); + } +} + +/* ------------------ * + * Horizontal scaling * + * ------------------ */ + +#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ +static void \ +interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT src_row_parts, \ + uint64_t * SMOL_RESTRICT dest_row_parts) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; \ + uint64_t p, q; \ + uint64_t F; \ + int i; \ +\ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); \ +\ + do \ + { \ + uint64_t accum = 0; \ +\ + for (i = 0; i < (1 << (n_halvings)); i++) \ + { \ + uint64_t pixel_ofs = *(precalc_x++); \ + F = *(precalc_x++); \ +\ + p = src_row_parts [pixel_ofs]; \ + q = src_row_parts [pixel_ofs + 1]; \ +\ + accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + } \ + *(dest_row_parts++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ + } \ + while (dest_row_parts != dest_row_parts_max); \ +} \ +\ +static void \ +interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT src_row_parts, \ + uint64_t * SMOL_RESTRICT dest_row_parts) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; \ + uint64_t p, q; \ + uint64_t F; \ + int i; \ +\ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); \ +\ + do \ + { \ + uint64_t accum [2] = { 0 }; \ + \ + for (i = 0; i < (1 << (n_halvings)); i++) \ + { \ + uint32_t pixel_ofs = *(precalc_x++) * 2; \ + F = *(precalc_x++); \ +\ + p = src_row_parts [pixel_ofs]; \ + q = src_row_parts [pixel_ofs + 2]; \ +\ + accum [0] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ +\ + p = src_row_parts [pixel_ofs + 1]; \ + q = src_row_parts [pixel_ofs + 3]; \ +\ + accum [1] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + } \ + *(dest_row_parts++) = ((accum [0]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ + *(dest_row_parts++) = ((accum [1]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ + } \ + while (dest_row_parts != dest_row_parts_max); \ +} + +static void +interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t p, q; + uint64_t F; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + do + { + uint32_t pixel_ofs = *(precalc_x++); + F = *(precalc_x++); + + p = src_row_parts [pixel_ofs]; + q = src_row_parts [pixel_ofs + 1]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (dest_row_parts != dest_row_parts_max); +} + +static void +interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT dest_row_parts_max = dest_row_parts + 
scale_ctx->hdim.placement_size_px * 2; + uint64_t p, q; + uint64_t F; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + do + { + uint32_t pixel_ofs = *(precalc_x++) * 2; + F = *(precalc_x++); + + p = src_row_parts [pixel_ofs]; + q = src_row_parts [pixel_ofs + 2]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + + p = src_row_parts [pixel_ofs + 1]; + q = src_row_parts [pixel_ofs + 3]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (dest_row_parts != dest_row_parts_max); +} + +DEF_INTERP_HORIZONTAL_BILINEAR(1) +DEF_INTERP_HORIZONTAL_BILINEAR(2) +DEF_INTERP_HORIZONTAL_BILINEAR(3) +DEF_INTERP_HORIZONTAL_BILINEAR(4) +DEF_INTERP_HORIZONTAL_BILINEAR(5) +DEF_INTERP_HORIZONTAL_BILINEAR(6) + +static SMOL_INLINE void +unpack_box_precalc (const uint32_t precalc, + uint32_t step, + uint32_t *ofs0, + uint32_t *ofs1, + uint32_t *f0, + uint32_t *f1, + uint32_t *n) +{ + *ofs0 = precalc; + *ofs1 = *ofs0 + step; + *f0 = 256 - (*ofs0 % SMOL_SUBPIXEL_MUL); + *f1 = *ofs1 % SMOL_SUBPIXEL_MUL; + *ofs0 /= SMOL_SUBPIXEL_MUL; + *ofs1 /= SMOL_SUBPIXEL_MUL; + *n = *ofs1 - *ofs0 - 1; +} + +static void +interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint64_t * SMOL_RESTRICT pp; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t accum; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts < dest_row_parts_max) + { + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); + + pp = src_row_parts + ofs0; + + accum = weight_pixel_64bpp (*(pp++), f0); + sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); + accum += weight_pixel_64bpp (*pp, f1); + + *(dest_row_parts++) = scale_64bpp (accum, scale_ctx->hdim.span_mul); + } +} + +static void +interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint64_t * SMOL_RESTRICT pp; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + uint64_t accum [2]; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts < dest_row_parts_max) + { + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + uint64_t t [2]; + + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); + + pp = src_row_parts + (ofs0 * 2); + + weight_pixel_128bpp (pp, accum, f0); + pp += 2; + + sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); + + weight_pixel_128bpp (pp, t, f1); + accum [0] += t [0]; + accum [1] += t [1]; + + scale_and_store_128bpp (accum, + scale_ctx->hdim.span_mul, + (uint64_t ** SMOL_RESTRICT) &dest_row_parts); + } +} + +static void +interp_horizontal_one_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t part; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t 
*); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + part = *src_row_parts; + while (dest_row_parts != dest_row_parts_max) + *(dest_row_parts++) = part; +} + +static void +interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts != dest_row_parts_max) + { + *(dest_row_parts++) = src_row_parts [0]; + *(dest_row_parts++) = src_row_parts [1]; + } +} + +static void +interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + memcpy (dest_row_parts, src_row_parts, scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); +} + +static void +interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + memcpy (dest_row_parts, src_row_parts, scale_ctx->hdim.placement_size_px * 2 * sizeof (uint64_t)); +} + +static void +scale_horizontal (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + const char *src_row, + uint64_t *dest_row_parts) +{ + uint64_t * SMOL_RESTRICT src_row_unpacked; + + src_row_unpacked = local_ctx->parts_row [3]; + + /* 32-bit unpackers need 32-bit alignment */ + if ((((uintptr_t) src_row) & 3) + && scale_ctx->src_pixel_type != SMOL_PIXEL_RGB8 + && scale_ctx->src_pixel_type != SMOL_PIXEL_BGR8) + { + if (!local_ctx->src_aligned) + local_ctx->src_aligned = + smol_alloc_aligned (scale_ctx->hdim.src_size_px * sizeof (uint32_t), + &local_ctx->src_aligned_storage); + memcpy (local_ctx->src_aligned, src_row, scale_ctx->hdim.src_size_px * sizeof (uint32_t)); + src_row = (const char *) local_ctx->src_aligned; + } + + scale_ctx->src_unpack_row_func (src_row, + src_row_unpacked, + scale_ctx->hdim.src_size_px); + scale_ctx->hfilter_func (scale_ctx, + src_row_unpacked, + dest_row_parts); + + apply_horiz_edge_opacity (scale_ctx, dest_row_parts); +} + +/* ---------------- * + * Vertical scaling * + * ---------------- */ + +static void +update_local_ctx_bilinear (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t new_src_ofs = precalc_y [dest_row_index * 2]; + + if (new_src_ofs == local_ctx->src_ofs) + return; + + if (new_src_ofs == local_ctx->src_ofs + 1) + { + uint64_t *t = local_ctx->parts_row [0]; + local_ctx->parts_row [0] = local_ctx->parts_row [1]; + local_ctx->parts_row [1] = t; + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); + } + else + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs), + local_ctx->parts_row [0]); + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); + } + + local_ctx->src_ofs = new_src_ofs; +} + +static void +interp_vertical_bilinear_store_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * 
SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(dest_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_store_with_opacity_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT accum_out, + uint32_t width) +{ + uint64_t *accum_dest_last = accum_out + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (accum_out != accum_dest_last); +} + +static void +interp_vertical_bilinear_store_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(dest_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_store_with_opacity_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + apply_subpixel_opacity_128bpp_half (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const 
uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT accum_out, + uint32_t width) +{ + uint64_t *accum_dest_last = accum_out + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (accum_out != accum_dest_last); +} + +#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + apply_subpixel_opacity_64bpp (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, 
const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + apply_subpixel_opacity_128bpp_half (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} + +#define DEF_SCALE_DEST_ROW_BILINEAR(n_halvings) \ +static int \ +scale_dest_row_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ +\ + return 2; \ +} \ +\ +static int \ +scale_dest_row_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, 
bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ +\ + return 2; \ +} + +static int +scale_dest_row_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; +} + +DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) + +static int +scale_dest_row_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = 
dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + bilin_index++; + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + bilin_index++; + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; +} + +DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) +DEF_SCALE_DEST_ROW_BILINEAR(2) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) +DEF_SCALE_DEST_ROW_BILINEAR(3) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) +DEF_SCALE_DEST_ROW_BILINEAR(4) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) +DEF_SCALE_DEST_ROW_BILINEAR(5) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) +DEF_SCALE_DEST_ROW_BILINEAR(6) + +static void +finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n) +{ + uint64_t *parts_dest_max = dest_parts + n; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *(dest_parts++) = scale_64bpp (*(accums++), multiplier); + } +} + +static void 
+finalize_vertical_with_opacity_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) +{ + uint64_t *parts_dest_max = dest_parts + n; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *dest_parts = scale_64bpp (*(accums++), multiplier); + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } +} + +static int +scale_dest_row_box_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; + + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); + + /* First input row */ + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; + + /* Add up whole input rows */ + + for (i = 0; i < n; i++) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px); + + ofs_y++; + } + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + /* Finalize */ + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); + } + + return 0; +} + +static void +finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n) +{ + uint64_t *parts_dest_max = dest_parts + n * 2; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + } +} + +static void +finalize_vertical_with_opacity_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) +{ + uint64_t *parts_dest_max = dest_parts + n * 2; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + dest_parts [0] = scale_128bpp_half (*(accums++), multiplier); + dest_parts [1] = scale_128bpp_half (*(accums++), 
multiplier); + apply_subpixel_opacity_128bpp (dest_parts, opacity); + dest_parts += 2; + } +} + +static int +scale_dest_row_box_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; + + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); + + /* First input row */ + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; + + /* Add up whole input rows */ + + for (i = 0; i < n; i++) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px * 2); + + ofs_y++; + } + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); + } + + return 0; +} + +static int +scale_dest_row_one_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + /* Scale the row and store it */ + + if (local_ctx->src_ofs != 0) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); + } + + return 1; +} + +static int +scale_dest_row_one_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + /* Scale the row and store it */ + + if (local_ctx->src_ofs != 0) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { 
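+ /* Top output row is only partially covered (subpixel placement); copy it with first_opacity applied */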
+ apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t) * 2); + } + + return 1; +} + +static int +scale_dest_row_copy (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, row_index), + local_ctx->parts_row [0]); + + return 0; +} + +/* ----------- * + * Compositing * + * ----------- */ + +static void +composite_over_color_64bpp (uint64_t * SMOL_RESTRICT srcdest_row, + const uint64_t * SMOL_RESTRICT color_pixel, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (srcdest_row, uint64_t *, sizeof (uint64_t)); + SMOL_ASSUME_ALIGNED_TO (color_pixel, const uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels; i++) + { + uint64_t a = srcdest_row [i] & 0xff; + + srcdest_row [i] += (((*color_pixel) * (0xff - a)) >> 8) & 0x00ff00ff00ff00ff; + } +} + +static void +composite_over_color_128bpp (uint64_t * SMOL_RESTRICT srcdest_row, + const uint64_t * SMOL_RESTRICT color_pixel, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (srcdest_row, uint64_t *, sizeof (uint64_t) * 2); + SMOL_ASSUME_ALIGNED_TO (color_pixel, const uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels * 2; i += 2) + { + uint64_t a = (srcdest_row [i + 1] >> 4) & 0xfff; + + srcdest_row [i] += ((color_pixel [0] * (0xfff - a)) >> 12) & 0x000fffff000fffff; + srcdest_row [i + 1] += ((color_pixel [1] * (0xfff - a)) >> 12) & 0x000fffff000fffff; + } +} + +static void +composite_over_dest_64bpp (const uint64_t * SMOL_RESTRICT src_row, + uint64_t * SMOL_RESTRICT dest_row, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_row, const uint64_t *, sizeof (uint64_t)); + SMOL_ASSUME_ALIGNED_TO (dest_row, uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels; i++) + { + dest_row [i] = ((src_row [i] + dest_row [i]) >> 1) & 0x7fff7fff7fff7fff; + } +} + +static void +composite_over_dest_128bpp (const uint64_t * SMOL_RESTRICT src_row, + uint64_t * SMOL_RESTRICT dest_row, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_row, const uint64_t *, sizeof (uint64_t) * 2); + SMOL_ASSUME_ALIGNED_TO (dest_row, uint64_t *, sizeof (uint64_t) * 2); + + for (i = 0; i < n_pixels * 2; i += 2) + { + dest_row [i] = ((src_row [i] + dest_row [i]) >> 1) & 0x7fffffff7fffffff; + dest_row [i + 1] = ((src_row [i + 1] + dest_row [i + 1]) >> 1) & 0x7fffffff7fffffff; + } +} + +/* -------- * + * Clearing * + * -------- */ + +static void +clear_24bpp (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels) +{ + const uint8_t *src_pixel_batch_u8 = src_pixel_batch; + const uint32_t *src_pixel_batch_u32 = src_pixel_batch; + uint8_t *dest_row_u8 = dest_row; + uint32_t *dest_row_u32 = dest_row; + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_pixel_batch_u32, const uint32_t *, sizeof (uint32_t)); + + for (i = 0; n_pixels - i >= 4; i += 4) + { + *(dest_row_u32++) = src_pixel_batch_u32 [0]; + *(dest_row_u32++) = src_pixel_batch_u32 [1]; + *(dest_row_u32++) = 
src_pixel_batch_u32 [2]; + } + + for ( ; i < n_pixels; i++) + { + dest_row_u8 [i * 3] = src_pixel_batch_u8 [0]; + dest_row_u8 [i * 3 + 1] = src_pixel_batch_u8 [1]; + dest_row_u8 [i * 3 + 2] = src_pixel_batch_u8 [2]; + } +} + +static void +clear_32bpp (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels) +{ + const uint32_t *src_pixel_batch_u32 = src_pixel_batch; + uint32_t *dest_row_u32 = dest_row; + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_pixel_batch_u32, const uint32_t *, sizeof (uint32_t)); + + for (i = 0; i < n_pixels; i++) + dest_row_u32 [i] = src_pixel_batch_u32 [0]; +} + +/* --------------- * + * Function tables * + * --------------- */ + +#define R SMOL_REPACK_META + +static const SmolRepackMeta repack_meta [] = +{ + R (123, 24, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + + R (1234, 32, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, LINEAR), + + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, PREMUL8, 
COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 123, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 321, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 4321, 32, UNASSOCIATED, COMPRESSED), + + SMOL_REPACK_META_LAST +}; + +#undef R + +static const SmolImplementation implementation = +{ + /* Horizontal init */ + init_horizontal, + + /* Vertical init */ + init_vertical, + + { + /* Horizontal filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, + { + /* 64bpp */ + interp_horizontal_copy_64bpp, + interp_horizontal_one_64bpp, + interp_horizontal_bilinear_0h_64bpp, + interp_horizontal_bilinear_1h_64bpp, + interp_horizontal_bilinear_2h_64bpp, + interp_horizontal_bilinear_3h_64bpp, + interp_horizontal_bilinear_4h_64bpp, + interp_horizontal_bilinear_5h_64bpp, + interp_horizontal_bilinear_6h_64bpp, + interp_horizontal_boxes_64bpp + }, + { + /* 128bpp */ + interp_horizontal_copy_128bpp, + interp_horizontal_one_128bpp, + interp_horizontal_bilinear_0h_128bpp, + interp_horizontal_bilinear_1h_128bpp, + interp_horizontal_bilinear_2h_128bpp, + interp_horizontal_bilinear_3h_128bpp, + interp_horizontal_bilinear_4h_128bpp, + interp_horizontal_bilinear_5h_128bpp, + interp_horizontal_bilinear_6h_128bpp, + interp_horizontal_boxes_128bpp + } + }, + { + /* Vertical 
filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, + { + /* 64bpp */ + scale_dest_row_copy, + scale_dest_row_one_64bpp, + scale_dest_row_bilinear_0h_64bpp, + scale_dest_row_bilinear_1h_64bpp, + scale_dest_row_bilinear_2h_64bpp, + scale_dest_row_bilinear_3h_64bpp, + scale_dest_row_bilinear_4h_64bpp, + scale_dest_row_bilinear_5h_64bpp, + scale_dest_row_bilinear_6h_64bpp, + scale_dest_row_box_64bpp + }, + { + /* 128bpp */ + scale_dest_row_copy, + scale_dest_row_one_128bpp, + scale_dest_row_bilinear_0h_128bpp, + scale_dest_row_bilinear_1h_128bpp, + scale_dest_row_bilinear_2h_128bpp, + scale_dest_row_bilinear_3h_128bpp, + scale_dest_row_bilinear_4h_128bpp, + scale_dest_row_bilinear_5h_128bpp, + scale_dest_row_bilinear_6h_128bpp, + scale_dest_row_box_128bpp + } + }, + { + /* Composite over color */ + NULL, + NULL, + composite_over_color_64bpp, + composite_over_color_128bpp + }, + { + /* Composite over dest */ + NULL, + NULL, + composite_over_dest_64bpp, + composite_over_dest_128bpp + }, + { + /* Clear dest */ + clear_24bpp, + clear_32bpp, + NULL, + NULL + }, + repack_meta +}; + +const SmolImplementation * +_smol_get_generic_implementation (void) +{ + return &implementation; +} diff --git a/chafa/internal/smolscale/smolscale-private.h b/chafa/internal/smolscale/smolscale-private.h index 4ad3429f..bb305841 100644 --- a/chafa/internal/smolscale/smolscale-private.h +++ b/chafa/internal/smolscale/smolscale-private.h @@ -2,6 +2,15 @@ /* Copyright © 2019-2023 Hans Petter Jansson. See COPYING for details. */ +/* If you're just going to use Smolscale in your project, you don't have to + * worry about anything in here. The public API and documentation, such as + * it is, lives in smolscale.h. + * + * If, on the other hand, you're here to hack on Smolscale itself, this file + * contains all the internal shared declarations. */ + +#undef SMOL_ENABLE_ASSERTS + #include <stdint.h> #include "smolscale.h" @@ -12,8 +21,16 @@ extern "C" { #endif -#include "config.h" +#ifdef SMOL_ENABLE_ASSERTS +# include <assert.h> +# define SMOL_ASSERT(x) assert (x) +#else +# define SMOL_ASSERT(x) +#endif +/* We'll use at most ~4MB of scratch space. That won't fit on the stack + * everywhere, so we default to malloc(). If you know better, you can define + * SMOL_USE_ALLOCA. */ #ifdef SMOL_USE_ALLOCA # define _SMOL_ALLOC(n) alloca (n) # define _SMOL_FREE(p) @@ -70,7 +87,10 @@ typedef unsigned int SmolBool; #define SMOL_ALIGNMENT 64 -#define SMOL_ASSUME_ALIGNED_TO(x, t, n) (x) = (t) __builtin_assume_aligned ((x), (n)) +#define SMOL_ASSIGN_ALIGNED_TO(x, t, n) (t) __builtin_assume_aligned ((x), (n)) +#define SMOL_ASSIGN_ALIGNED(x, t) SMOL_ASSIGN_ALIGNED_TO ((x), t, SMOL_ALIGNMENT) + +#define SMOL_ASSUME_ALIGNED_TO(x, t, n) (x) = SMOL_ASSIGN_ALIGNED_TO ((x), t, (n)) +#define SMOL_ASSUME_ALIGNED(x, t) SMOL_ASSUME_ALIGNED_TO ((x), t, SMOL_ALIGNMENT) /* Pointer to beginning of storage is stored in *r. This must be passed to smol_free() later.
*/ @@ -81,8 +101,11 @@ typedef unsigned int SmolBool; typedef enum { + SMOL_STORAGE_24BPP, + SMOL_STORAGE_32BPP, SMOL_STORAGE_64BPP, SMOL_STORAGE_128BPP, + SMOL_STORAGE_MAX } SmolStorageType; @@ -104,92 +127,281 @@ typedef enum } SmolFilterType; +typedef enum +{ + SMOL_REORDER_1234_TO_1234, + + SMOL_REORDER_1234_TO_2341, + SMOL_REORDER_1234_TO_3214, + SMOL_REORDER_1234_TO_4123, + SMOL_REORDER_1234_TO_4321, + SMOL_REORDER_1234_TO_123, + SMOL_REORDER_1234_TO_321, + SMOL_REORDER_123_TO_1234, + + SMOL_REORDER_1234_TO_1324, + SMOL_REORDER_1234_TO_2314, + SMOL_REORDER_1234_TO_2431, + SMOL_REORDER_1234_TO_4132, + SMOL_REORDER_1234_TO_4231, + SMOL_REORDER_1234_TO_132, + SMOL_REORDER_1234_TO_231, + SMOL_REORDER_123_TO_1324, + + SMOL_REORDER_1234_TO_324, + SMOL_REORDER_1234_TO_423, + + SMOL_REORDER_1234_TO_1423, + SMOL_REORDER_1234_TO_3241, + + SMOL_REORDER_MAX +} +SmolReorderType; + +typedef enum +{ + SMOL_ALPHA_UNASSOCIATED, + SMOL_ALPHA_PREMUL8, + SMOL_ALPHA_PREMUL16, + + SMOL_ALPHA_MAX +} +SmolAlphaType; + +typedef enum +{ + SMOL_GAMMA_SRGB_COMPRESSED, + SMOL_GAMMA_SRGB_LINEAR, + + SMOL_GAMMA_MAX +} +SmolGammaType; + +typedef struct +{ + unsigned char src [4]; + unsigned char dest [4]; +} +SmolReorderMeta; + +typedef struct +{ + unsigned char storage; + unsigned char pixel_stride; + unsigned char alpha; + unsigned char order [4]; +} +SmolPixelTypeMeta; + /* For reusing rows that have already undergone horizontal scaling */ typedef struct { - uint32_t in_ofs; + uint32_t src_ofs; uint64_t *parts_row [4]; uint64_t *row_storage [4]; - uint32_t *in_aligned; - uint32_t *in_aligned_storage; + uint32_t *src_aligned; + uint32_t *src_aligned_storage; } -SmolVerticalCtx; +SmolLocalCtx; -typedef void (SmolUnpackRowFunc) (const uint32_t *row_in, - uint64_t *row_out, +typedef void (SmolInitFunc) (SmolScaleCtx *scale_ctx); +typedef void (SmolRepackRowFunc) (const void *src_row, + void *dest_row, uint32_t n_pixels); -typedef void (SmolPackRowFunc) (const uint64_t *row_in, - uint32_t *row_out, - uint32_t n_pixels); typedef void (SmolHFilterFunc) (const SmolScaleCtx *scale_ctx, - const uint64_t *row_limbs_in, - uint64_t *row_limbs_out); -typedef void (SmolVFilterFunc) (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out); - -#define SMOL_CONV_UNDEFINED { 0, NULL, NULL } -#define SMOL_CONV(un_from_order, un_from_type, un_to_order, un_to_type, pk_from_order, pk_from_type, pk_to_order, pk_to_type, storage_bits) \ -{ storage_bits / 8, (SmolUnpackRowFunc *) unpack_row_##un_from_order##_##un_from_type##_to_##un_to_order##_##un_to_type##_##storage_bits##bpp, \ -(SmolPackRowFunc *) pack_row_##pk_from_order##_##pk_from_type##_to_##pk_to_order##_##pk_to_type##_##storage_bits##bpp } + const uint64_t *src_row_limbs, + uint64_t *dest_row_limbs); +typedef int (SmolVFilterFunc) (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index); +typedef void (SmolCompositeOverColorFunc) (uint64_t *srcdest_row, + const uint64_t *color_pixel, + uint32_t n_pixels); +typedef void (SmolCompositeOverDestFunc) (const uint64_t *src_row, + uint64_t *dest_row, + uint32_t n_pixels); +typedef void (SmolClearFunc) (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels); + +#define SMOL_REPACK_SIGNATURE_GET_REORDER(sig) ((sig) >> (2 * (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS))) + +#define SMOL_REORDER_BITS 6 +#define SMOL_STORAGE_BITS 2 +#define SMOL_ALPHA_BITS 2 +#define SMOL_GAMMA_BITS 1 + +#define 
SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER(src_storage, src_alpha, src_gamma, \ + dest_storage, dest_alpha, dest_gamma) \ + (((src_storage) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS + SMOL_GAMMA_BITS + SMOL_ALPHA_BITS)) \ + | ((src_alpha) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS + SMOL_GAMMA_BITS)) \ + | ((src_gamma) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS)) \ + | ((dest_storage) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS)) \ + | ((dest_alpha) << (SMOL_GAMMA_BITS)) \ + | ((dest_gamma) << 0)) \ + +#define MASK_ITEM(m, n_bits) ((m) ? (1 << (n_bits)) - 1 : 0) + +#define SMOL_REPACK_SIGNATURE_ANY_ORDER_MASK(src_storage, src_alpha, src_gamma, \ + dest_storage, dest_alpha, dest_gamma) \ + SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER(MASK_ITEM (src_storage, SMOL_STORAGE_BITS), \ + MASK_ITEM (src_alpha, SMOL_ALPHA_BITS), \ + MASK_ITEM (src_gamma, SMOL_GAMMA_BITS), \ + MASK_ITEM (dest_storage, SMOL_STORAGE_BITS), \ + MASK_ITEM (dest_alpha, SMOL_ALPHA_BITS), \ + MASK_ITEM (dest_gamma, SMOL_GAMMA_BITS)) + +#define SMOL_REPACK_META(src_order, src_storage, src_alpha, src_gamma, \ + dest_order, dest_storage, dest_alpha, dest_gamma) \ + { (((SMOL_REORDER_##src_order##_TO_##dest_order) << 10) \ + | ((SMOL_STORAGE_##src_storage##BPP) << 8) | ((SMOL_ALPHA_##src_alpha) << 6) \ + | ((SMOL_GAMMA_SRGB_##src_gamma) << 5) \ + | ((SMOL_STORAGE_##dest_storage##BPP) << 3) | ((SMOL_ALPHA_##dest_alpha) << 1) \ + | ((SMOL_GAMMA_SRGB_##dest_gamma) << 0)), \ + (SmolRepackRowFunc *) repack_row_##src_order##_##src_storage##_##src_alpha##_##src_gamma##_to_##dest_order##_##dest_storage##_##dest_alpha##_##dest_gamma } + +#define SMOL_REPACK_META_LAST { 0xffff, NULL } typedef struct { - uint8_t n_bytes_per_pixel; - SmolUnpackRowFunc *unpack_row_func; - SmolPackRowFunc *pack_row_func; + uint16_t signature; + SmolRepackRowFunc *repack_row_func; } -SmolConversion; +SmolRepackMeta; + +#define SMOL_REPACK_ROW_DEF(src_order, src_storage, src_limb_bits, src_alpha, src_gamma, \ + dest_order, dest_storage, dest_limb_bits, dest_alpha, dest_gamma) \ + static void repack_row_##src_order##_##src_storage##_##src_alpha##_##src_gamma##_to_##dest_order##_##dest_storage##_##dest_alpha##_##dest_gamma \ + (const uint##src_limb_bits##_t * SMOL_RESTRICT src_row, \ + uint##dest_limb_bits##_t * SMOL_RESTRICT dest_row, \ + uint32_t n_pixels) \ + { \ + uint##dest_limb_bits##_t *dest_row_max = dest_row + n_pixels * (dest_storage / dest_limb_bits); \ + SMOL_ASSUME_ALIGNED_TO (src_row, uint##src_limb_bits##_t *, src_limb_bits / 8); \ + SMOL_ASSUME_ALIGNED_TO (dest_row, uint##dest_limb_bits##_t *, dest_limb_bits / 8); + +#define SMOL_REPACK_ROW_DEF_END } typedef struct { - SmolConversion conversions [SMOL_STORAGE_MAX] [SMOL_PIXEL_MAX] [SMOL_PIXEL_MAX]; + SmolInitFunc *init_h_func; + SmolInitFunc *init_v_func; + SmolHFilterFunc *hfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + SmolVFilterFunc *vfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + SmolCompositeOverColorFunc *composite_over_color_funcs [SMOL_STORAGE_MAX]; + SmolCompositeOverDestFunc *composite_over_dest_funcs [SMOL_STORAGE_MAX]; + SmolClearFunc *clear_funcs [SMOL_STORAGE_MAX]; + const SmolRepackMeta *repack_meta; } -SmolConversionTable; +SmolImplementation; typedef struct { - SmolHFilterFunc *hfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; - SmolVFilterFunc *vfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + void *precalc; + SmolFilterType filter_type; + + uint32_t src_size_px, src_size_spx; + uint32_t dest_size_px, dest_size_spx; + + unsigned int 
n_halvings; + + int32_t placement_ofs_px, placement_ofs_spx; + uint32_t placement_size_px, placement_size_spx; + uint32_t placement_size_prehalving_px, placement_size_prehalving_spx; + + uint32_t span_step; /* For box filter, in spx */ + uint32_t span_mul; /* For box filter */ + + /* Opacity of first and last column or row. Used for subpixel placement + * and applied after each scaling step. */ + uint16_t first_opacity, last_opacity; + + /* Rows or cols to add consisting of unbroken pixel_color. This is done + * after scaling but before conversion to output pixel format. */ + uint16_t clear_before_px, clear_after_px; - /* Can be a NULL pointer if the implementation does not override any - * conversions. */ - const SmolConversionTable *ctab; + uint16_t clip_before_px, clip_after_px; } -SmolImplementation; +SmolDim; + +#define SMOL_CLEAR_BATCH_SIZE 96 struct SmolScaleCtx { /* */ - const uint32_t *pixels_in; - uint32_t *pixels_out; - uint32_t width_in, height_in, rowstride_in; - uint32_t width_out, height_out, rowstride_out; + const char *src_pixels; + char *dest_pixels; - SmolPixelType pixel_type_in, pixel_type_out; - SmolFilterType filter_h, filter_v; + uint32_t src_rowstride; + uint32_t dest_rowstride; + + SmolPixelType src_pixel_type, dest_pixel_type; SmolStorageType storage_type; + SmolGammaType gamma_type; + SmolCompositeOp composite_op; + + /* Raw flags passed in by user */ + SmolFlags flags; - SmolUnpackRowFunc *unpack_row_func; - SmolPackRowFunc *pack_row_func; + SmolRepackRowFunc *src_unpack_row_func; + SmolRepackRowFunc *dest_unpack_row_func; + SmolRepackRowFunc *pack_row_func; SmolHFilterFunc *hfilter_func; SmolVFilterFunc *vfilter_func; + SmolCompositeOverColorFunc *composite_over_color_func; + SmolCompositeOverDestFunc *composite_over_dest_func; + SmolClearFunc *clear_dest_func; /* User specified, can be NULL */ SmolPostRowFunc *post_row_func; void *user_data; - /* Each offset is split in two uint16s: { pixel index, fraction }. These - * are relative to the image after halvings have taken place. */ - uint16_t *offsets_x, *offsets_y; - uint32_t span_mul_x, span_mul_y; /* For box filter */ + /* Storage for dimensions' precalc arrays. Single allocation. */ + void *precalc_storage; + + /* Specifics for each dimension */ + SmolDim hdim, vdim; + + /* TRUE if input rows can be copied directly to output. */ + unsigned int is_noop : 1; + + /* TRUE if we have a color_pixel to composite on. */ + unsigned int have_composite_color : 1; - uint32_t width_bilin_out, height_bilin_out; - unsigned int width_halvings, height_halvings; + /* Unpacked color to composite on */ + uint64_t color_pixel [2]; + + /* A batch of color pixels in dest storage format. The batch size + * is in bytes, and chosen as an even multiple of 3, allowing 32 bytes wide + * operations (e.g. AVX2) to be used to clear packed RGB pixels. */ + unsigned char color_pixels_clear_batch [SMOL_CLEAR_BATCH_SIZE]; }; +/* Number of pixels to convert per batch. For some conversions, we perform + * an alpha test per batch to avoid the expensive premul path when the image + * data is opaque. + * + * FIXME: Unimplemented. 
*/ +#define PIXEL_BATCH_SIZE 32 + +#define SRGB_LINEAR_BITS 11 +#define SRGB_LINEAR_MAX (1 << (SRGB_LINEAR_BITS)) + +extern const uint16_t _smol_from_srgb_lut [256]; +extern const uint8_t _smol_to_srgb_lut [SRGB_LINEAR_MAX]; + +#define INVERTED_DIV_SHIFT_P8 (21 - 8) +#define INVERTED_DIV_SHIFT_P8L (22 - SRGB_LINEAR_BITS) +#define INVERTED_DIV_SHIFT_P16 (24 - 8) +#define INVERTED_DIV_SHIFT_P16L (30 - SRGB_LINEAR_BITS) + +extern const uint32_t _smol_inv_div_p8_lut [256]; +extern const uint32_t _smol_inv_div_p8l_lut [256]; +extern const uint32_t _smol_inv_div_p16_lut [256]; +extern const uint32_t _smol_inv_div_p16l_lut [256]; + +const SmolImplementation *_smol_get_generic_implementation (void); #ifdef SMOL_WITH_AVX2 const SmolImplementation *_smol_get_avx2_implementation (void); #endif diff --git a/chafa/internal/smolscale/smolscale.c b/chafa/internal/smolscale/smolscale.c index 2f983219..c14bff7c 100644 --- a/chafa/internal/smolscale/smolscale.c +++ b/chafa/internal/smolscale/smolscale.c @@ -8,3023 +8,1337 @@ #include #include "smolscale-private.h" -/* --- Premultiplication --- */ +/* ----------------------- * + * Misc. conversion tables * + * ----------------------- */ -#define INVERTED_DIV_SHIFT 21 -#define INVERTED_DIV_ROUNDING (1U << (INVERTED_DIV_SHIFT - 1)) -#define INVERTED_DIV_ROUNDING_128BPP \ - (((uint64_t) INVERTED_DIV_ROUNDING << 32) | INVERTED_DIV_ROUNDING) - -/* This table is used to divide by an integer [1..255] using only a lookup, - * multiplication and a shift. This is faster than plain division on most - * architectures. +/* Table of channel reorderings. Each entry describes an available shuffle + * implementation indexed by its SmolReorderType. Channel indexes are 1-based. + * A zero index denotes that the channel is not present (e.g. 3-channel RGB). * - * Each entry represents the integer 2097152 (1 << 21) divided by the index - * of the entry. Consequently, + * Keep in sync with the private SmolReorderType enum. */ +static const SmolReorderMeta reorder_meta [SMOL_REORDER_MAX] = +{ + { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } }, + + { { 1, 2, 3, 4 }, { 2, 3, 4, 1 } }, + { { 1, 2, 3, 4 }, { 3, 2, 1, 4 } }, + { { 1, 2, 3, 4 }, { 4, 1, 2, 3 } }, + { { 1, 2, 3, 4 }, { 4, 3, 2, 1 } }, + { { 1, 2, 3, 4 }, { 1, 2, 3, 0 } }, + { { 1, 2, 3, 4 }, { 3, 2, 1, 0 } }, + { { 1, 2, 3, 0 }, { 1, 2, 3, 4 } }, + + { { 1, 2, 3, 4 }, { 1, 3, 2, 4 } }, + { { 1, 2, 3, 4 }, { 2, 3, 1, 4 } }, + { { 1, 2, 3, 4 }, { 2, 4, 3, 1 } }, + { { 1, 2, 3, 4 }, { 4, 1, 3, 2 } }, + { { 1, 2, 3, 4 }, { 4, 2, 3, 1 } }, + { { 1, 2, 3, 4 }, { 1, 3, 2, 0 } }, + { { 1, 2, 3, 4 }, { 2, 3, 1, 0 } }, + { { 1, 2, 3, 0 }, { 1, 3, 2, 4 } }, + + { { 1, 2, 3, 4 }, { 3, 2, 4, 0 } }, + { { 1, 2, 3, 4 }, { 4, 2, 3, 0 } }, + + { { 1, 2, 3, 4 }, { 1, 4, 2, 3 } }, + { { 1, 2, 3, 4 }, { 3, 2, 4, 1 } } +}; + +/* Metadata for each pixel type. Storage type, number of channels, alpha type, + * channel ordering. Channel indexes are 1-based, and 4 is always alpha. A + * zero index denotes that the channel is not present. * - * (v / i) ~= (v * inverted_div_table [i] + (1 << 20)) >> 21 + * RGBA = 1, 2, 3, 4. * - * (1 << 20) is added for nearest rounding. It would've been nice to keep - * this table in uint16_t, but alas, we need the extra bits for sufficient - * precision. 
*/ -static const uint32_t inverted_div_table [256] = -{ - 0,2097152,1048576, 699051, 524288, 419430, 349525, 299593, - 262144, 233017, 209715, 190650, 174763, 161319, 149797, 139810, - 131072, 123362, 116508, 110376, 104858, 99864, 95325, 91181, - 87381, 83886, 80660, 77672, 74898, 72316, 69905, 67650, - 65536, 63550, 61681, 59919, 58254, 56680, 55188, 53773, - 52429, 51150, 49932, 48771, 47663, 46603, 45590, 44620, - 43691, 42799, 41943, 41121, 40330, 39569, 38836, 38130, - 37449, 36792, 36158, 35545, 34953, 34380, 33825, 33288, - 32768, 32264, 31775, 31301, 30840, 30394, 29959, 29537, - 29127, 28728, 28340, 27962, 27594, 27236, 26887, 26546, - 26214, 25891, 25575, 25267, 24966, 24672, 24385, 24105, - 23831, 23564, 23302, 23046, 22795, 22550, 22310, 22075, - 21845, 21620, 21400, 21183, 20972, 20764, 20560, 20361, - 20165, 19973, 19784, 19600, 19418, 19240, 19065, 18893, - 18725, 18559, 18396, 18236, 18079, 17924, 17772, 17623, - 17476, 17332, 17190, 17050, 16913, 16777, 16644, 16513, - 16384, 16257, 16132, 16009, 15888, 15768, 15650, 15534, - 15420, 15308, 15197, 15087, 14980, 14873, 14769, 14665, - 14564, 14463, 14364, 14266, 14170, 14075, 13981, 13888, - 13797, 13707, 13618, 13530, 13443, 13358, 13273, 13190, - 13107, 13026, 12945, 12866, 12788, 12710, 12633, 12558, - 12483, 12409, 12336, 12264, 12193, 12122, 12053, 11984, - 11916, 11848, 11782, 11716, 11651, 11586, 11523, 11460, - 11398, 11336, 11275, 11215, 11155, 11096, 11038, 10980, - 10923, 10866, 10810, 10755, 10700, 10645, 10592, 10538, - 10486, 10434, 10382, 10331, 10280, 10230, 10180, 10131, - 10082, 10034, 9986, 9939, 9892, 9846, 9800, 9754, - 9709, 9664, 9620, 9576, 9533, 9489, 9447, 9404, - 9362, 9321, 9279, 9239, 9198, 9158, 9118, 9079, - 9039, 9001, 8962, 8924, 8886, 8849, 8812, 8775, - 8738, 8702, 8666, 8630, 8595, 8560, 8525, 8490, - 8456, 8422, 8389, 8355, 8322, 8289, 8257, 8224, + * Keep in sync with the public SmolPixelType enum. */ +static const SmolPixelTypeMeta pixel_type_meta [SMOL_PIXEL_MAX] = +{ + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 1, 2, 3, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 3, 2, 1, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 4, 1, 2, 3 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 4, 3, 2, 1 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 1, 2, 3, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 3, 2, 1, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 4, 1, 2, 3 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 4, 3, 2, 1 } }, + { SMOL_STORAGE_24BPP, 3, SMOL_ALPHA_PREMUL8, { 1, 2, 3, 0 } }, + { SMOL_STORAGE_24BPP, 3, SMOL_ALPHA_PREMUL8, { 3, 2, 1, 0 } } }; -/* Masking and shifting out the results is left to the caller. In - * and out may not overlap. 
*/ -static SMOL_INLINE void -unpremul_i_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = ((in [0] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); - out [1] = ((in [1] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); -} - -static SMOL_INLINE void -unpremul_p_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = (((in [0] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); - out [1] = (((in [1] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); -} - -static SMOL_INLINE uint64_t -unpremul_p_to_u_64bpp (const uint64_t in, - uint8_t alpha) -{ - uint64_t in_128bpp [2]; - uint64_t out_128bpp [2]; - - in_128bpp [0] = (in & 0x000000ff000000ff); - in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - - unpremul_p_to_u_128bpp (in_128bpp, out_128bpp, alpha); - - return (out_128bpp [0] & 0x000000ff000000ff) - | ((out_128bpp [1] & 0x000000ff000000ff) << 16); -} - -static SMOL_INLINE uint64_t -premul_u_to_p_64bpp (const uint64_t in, - uint8_t alpha) -{ - return ((in * ((uint16_t) alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; -} - -/* --- Packing --- */ - -/* It's nice to be able to shift by a negative amount */ -#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) - -#if 0 -/* Currently unused */ - -/* This is kind of bulky (~13 x86 insns), but it's about the same as using - * unions, and we don't have to worry about endianness. */ -#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) -#endif +/* Channel ordering corrected for little endian. Only applies when fetching + * entire pixels as dwords (i.e. u32), so 3-byte variants don't require any + * correction. + * + * Keep in sync with the public SmolPixelType enum. */ +static const SmolPixelType pixel_type_u32_le [SMOL_PIXEL_MAX] = +{ + SMOL_PIXEL_ABGR8_PREMULTIPLIED, + SMOL_PIXEL_ARGB8_PREMULTIPLIED, + SMOL_PIXEL_BGRA8_PREMULTIPLIED, + SMOL_PIXEL_RGBA8_PREMULTIPLIED, + SMOL_PIXEL_ABGR8_UNASSOCIATED, + SMOL_PIXEL_ARGB8_UNASSOCIATED, + SMOL_PIXEL_BGRA8_UNASSOCIATED, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + SMOL_PIXEL_RGB8, + SMOL_PIXEL_BGR8 +}; -#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) - -#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 
2 : n) - -#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - -#if 0 -/* Currently unused */ - -#define PACK_FROM_1324_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [(SWAP_2_AND_3 (a) - 1) >> 1]), \ - ((SWAP_2_AND_3 (a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (b) - 1) >> 1]), \ - ((SWAP_2_AND_3 (b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (c) - 1) >> 1]), \ - ((SWAP_2_AND_3 (c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (d) - 1) >> 1]), \ - ((SWAP_2_AND_3 (d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -#endif +/* ----------------------------------- * + * sRGB/linear conversion: Shared code * + * ----------------------------------- */ -/* Pack p -> p */ +/* These tables are manually tweaked to be reversible without information + * loss; _smol_to_srgb_lut [_smol_from_srgb_lut [i]] == i. + * + * As a side effect, the values in the lower range (first 35 indexes) are + * off by < 2%. */ + +const uint16_t _smol_from_srgb_lut [256] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 62, 64, 67, 69, 72, 74, 77, 79, 82, 85, 88, 91, + 94, 97, 100, 103, 106, 109, 113, 116, 119, 123, 126, 130, + 134, 137, 141, 145, 149, 153, 157, 161, 165, 169, 174, 178, + 182, 187, 191, 196, 201, 205, 210, 215, 220, 225, 230, 235, + 240, 246, 251, 256, 262, 267, 273, 279, 284, 290, 296, 302, + 308, 314, 320, 326, 333, 339, 345, 352, 359, 365, 372, 379, + 385, 392, 399, 406, 414, 421, 428, 435, 443, 450, 458, 466, + 473, 481, 489, 497, 505, 513, 521, 530, 538, 546, 555, 563, + 572, 581, 589, 598, 607, 616, 625, 634, 644, 653, 662, 672, + 682, 691, 701, 711, 721, 731, 741, 751, 761, 771, 782, 792, + 803, 813, 824, 835, 845, 856, 867, 879, 890, 901, 912, 924, + 935, 947, 959, 970, 982, 994, 1006, 1018, 1030, 1043, 1055, 1067, + 1080, 1093, 1105, 1118, 1131, 1144, 1157, 1170, 1183, 1197, 1210, 1223, + 1237, 1251, 1264, 1278, 1292, 1306, 1320, 1334, 1349, 1363, 1377, 1392, + 1407, 1421, 1436, 1451, 1466, 1481, 1496, 1512, 1527, 1542, 1558, 1573, + 1589, 1605, 1621, 1637, 1653, 1669, 1685, 1702, 1718, 1735, 1751, 1768, + 1785, 1802, 1819, 1836, 1853, 1870, 1887, 1905, 1922, 1940, 1958, 1976, + 1994, 2012, 2030, 2047 +}; -static SMOL_INLINE uint32_t -pack_pixel_1324_p_to_1234_p_64bpp (uint64_t in) -{ - return in | (in >> 24); -} +const uint8_t _smol_to_srgb_lut [SRGB_LINEAR_MAX] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 37, 37, 38, 38, + 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, + 46, 46, 47, 47, 47, 48, 48, 49, 49, 49, 50, 50, 51, 51, + 51, 52, 52, 53, 53, 53, 54, 54, 55, 55, 55, 56, 56, 56, + 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, + 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, + 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 70, + 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, + 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 77, 77, + 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, + 81, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 
83, 83, 84, + 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 86, 87, + 87, 87, 87, 88, 88, 88, 88, 88, 89, 89, 89, 89, 89, 90, + 90, 90, 90, 90, 91, 91, 91, 91, 91, 92, 92, 92, 92, 92, + 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, 95, 95, 95, 95, + 95, 96, 96, 96, 96, 96, 97, 97, 97, 97, 97, 98, 98, 98, + 98, 98, 98, 99, 99, 99, 99, 99, 100, 100, 100, 100, 100, 100, + 101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 102, 103, 103, 103, + 103, 103, 103, 104, 104, 104, 104, 104, 105, 105, 105, 105, 105, 105, + 106, 106, 106, 106, 106, 106, 107, 107, 107, 107, 107, 107, 108, 108, + 108, 108, 108, 108, 109, 109, 109, 109, 109, 109, 110, 110, 110, 110, + 110, 110, 110, 111, 111, 111, 111, 111, 111, 112, 112, 112, 112, 112, + 112, 113, 113, 113, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114, + 115, 115, 115, 115, 115, 115, 115, 116, 116, 116, 116, 116, 116, 117, + 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 119, + 119, 119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 122, 123, 123, + 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 124, 125, + 125, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126, 127, + 127, 127, 127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, + 130, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, + 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, + 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, 137, 137, 137, 137, 137, 137, + 137, 138, 138, 138, 138, 138, 138, 138, 138, 139, 139, 139, 139, 139, + 139, 139, 139, 140, 140, 140, 140, 140, 140, 140, 140, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 142, 142, 142, 142, 142, 142, 142, 142, + 143, 143, 143, 143, 143, 143, 143, 143, 143, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 145, 145, 145, 145, 145, 145, 145, 145, 146, 146, + 146, 146, 146, 146, 146, 146, 146, 147, 147, 147, 147, 147, 147, 147, + 147, 147, 148, 148, 148, 148, 148, 148, 148, 148, 148, 149, 149, 149, + 149, 149, 149, 149, 149, 149, 150, 150, 150, 150, 150, 150, 150, 150, + 150, 151, 151, 151, 151, 151, 151, 151, 151, 151, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, 153, 153, 153, 153, 153, 153, + 153, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 155, 155, 155, + 155, 155, 155, 155, 155, 155, 156, 156, 156, 156, 156, 156, 156, 156, + 156, 156, 157, 157, 157, 157, 157, 157, 157, 157, 157, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 158, 159, 159, 159, 159, 159, 159, 159, + 159, 159, 159, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 161, + 161, 161, 161, 161, 161, 161, 161, 161, 161, 162, 162, 162, 162, 162, + 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 163, + 163, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, + 167, 168, 168, 168, 168, 168, 168, 168, 168, 168, 168, 169, 169, 169, + 169, 169, 169, 169, 169, 169, 169, 169, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 171, 171, 171, 171, 171, 171, 171, 171, 171, + 171, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 173, 173, + 173, 173, 173, 173, 173, 173, 173, 173, 173, 174, 174, 174, 174, 174, + 174, 174, 174, 174, 174, 174, 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 
176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, + 176, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, 178, 179, 179, 179, 179, 179, + 179, 179, 179, 179, 179, 179, 179, 180, 180, 180, 180, 180, 180, 180, + 180, 180, 180, 180, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, + 181, 181, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, + 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 184, 184, 184, + 184, 184, 184, 184, 184, 184, 184, 184, 184, 185, 185, 185, 185, 185, + 185, 185, 185, 185, 185, 185, 185, 186, 186, 186, 186, 186, 186, 186, + 186, 186, 186, 186, 186, 187, 187, 187, 187, 187, 187, 187, 187, 187, + 187, 187, 187, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, + 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 191, 191, + 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, 193, + 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, + 194, 194, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 195, 195, + 195, 195, 195, 195, 195, 195, 196, 196, 196, 196, 196, 196, 196, 196, + 196, 196, 196, 196, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, + 197, 197, 197, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, + 198, 198, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, + 199, 199, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 200, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, + 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, + 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 205, + 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 209, 209, + 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 210, 210, + 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 211, 211, + 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 213, + 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 214, + 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 215, + 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, + 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, + 216, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, + 217, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, + 218, 218, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 222, 222, 222, 222, 222, 222, 222, 222, 222, + 222, 222, 222, 222, 222, 222, 223, 223, 223, 223, 223, 223, 223, 223, + 223, 223, 223, 223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 224, + 224, 224, 224, 224, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, + 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 226, 226, 226, 226, + 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, 
227, 227, 227, 227, 227, 227, + 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, + 228, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 231, 231, 231, 231, 231, 231, 231, 231, 231, + 231, 231, 231, 231, 231, 231, 231, 232, 232, 232, 232, 232, 232, 232, + 232, 232, 232, 232, 232, 232, 232, 232, 232, 233, 233, 233, 233, 233, + 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, + 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, + 235, 235, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 237, 237, 237, 237, 237, 237, 237, 237, 237, 237, + 237, 237, 237, 237, 237, 237, 237, 238, 238, 238, 238, 238, 238, 238, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 239, 239, 239, 239, + 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 240, 240, + 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, + 240, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, + 241, 241, 241, 241, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 243, 243, 243, 243, 243, 243, 243, + 243, 243, 243, 243, 243, 243, 243, 243, 243, 243, 244, 244, 244, 244, + 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 245, + 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, + 245, 245, 245, 246, 246, 246, 246, 246, 246, 246, 246, 246, 246, 246, + 246, 246, 246, 246, 246, 246, 247, 247, 247, 247, 247, 247, 247, 247, + 247, 247, 247, 247, 247, 247, 247, 247, 247, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 249, + 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, + 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, + 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255 +}; -static void -pack_row_1324_p_to_1234_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; +/* ------------------------------ * + * Premultiplication: Shared code * + * ------------------------------ */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* These tables are used to divide by an integer [1..255] using only a lookup, + * multiplication and a shift. This is faster than plain division on most + * architectures. + * + * The values are tuned to minimize the error and overhead when turning + * premultiplied (8-bit, 11-bit, 16-bit, 19-bit) into 8-bit unassociated alpha. */ + +/* Lossy premultiplication: 8-bit * alpha -> 8-bit. Not perfectly reversible. 
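+ *
+ * Rough sketch of the trick (generic constants for illustration only; the
+ * entries in these tables are tuned per table and are not restated here):
+ * to divide x by d with a multiply and a shift, precompute K_d ~= 2^S / d
+ * and evaluate
+ *
+ *   x / d  ~=  (x * K_d) >> S
+ *
+ * e.g. with S = 16 and K_5 = 13108, (201 * 13108) >> 16 = 40, matching 201 / 5.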
*/ +const uint32_t _smol_inv_div_p8_lut [256] = +{ + 0x00000000, 0x00181fff, 0x000e2fff, 0x0009f555, 0x0007a7ff, 0x00063333, 0x00052555, 0x00047999, + 0x0003ebff, 0x0003838e, 0x00032333, 0x0002e2e8, 0x0002a2aa, 0x0002713b, 0x00024249, 0x00021ccc, + 0x0001f924, 0x0001dd17, 0x0001c1c7, 0x0001ab4b, 0x000195e5, 0x0001830c, 0x000170c3, 0x00016164, + 0x0001537a, 0x0001450d, 0x0001390b, 0x00012de9, 0x00012249, 0x00011846, 0x00010eaa, 0x0001069e, + 0x0000fd70, 0x0000f6aa, 0x0000eedb, 0x0000e8f5, 0x0000e1c7, 0x0000db8e, 0x0000d638, 0x0000d069, + 0x0000cb7c, 0x0000c690, 0x0000c186, 0x0000bd2d, 0x0000b8f9, 0x0000b4f7, 0x0000b0ff, 0x0000ad65, + 0x0000a9ac, 0x0000a687, 0x0000a286, 0x00009f33, 0x00009c98, 0x000099b9, 0x000096f1, 0x00009414, + 0x00009147, 0x00008efa, 0x00008c59, 0x00008a0a, 0x000087b4, 0x0000856c, 0x00008341, 0x0000818c, + 0x00007f55, 0x00007d60, 0x00007b7f, 0x000079b2, 0x000077b9, 0x00007608, 0x0000743c, 0x000072b5, + 0x0000711a, 0x00006fac, 0x00006e1a, 0x00006cad, 0x00006b17, 0x000069e1, 0x00006864, 0x00006736, + 0x000065db, 0x000064b1, 0x00006357, 0x00006250, 0x000060c5, 0x00006060, 0x00005ec0, 0x00005da5, + 0x00005c9b, 0x00005b8b, 0x00005a93, 0x000059ab, 0x00005884, 0x00005799, 0x000056ae, 0x000055d5, + 0x000054e2, 0x0000540b, 0x00005343, 0x00005255, 0x0000517c, 0x000050a7, 0x00004fff, 0x00004f2c, + 0x00004e5e, 0x00004d9f, 0x00004cec, 0x00004c34, 0x00004b78, 0x00004adc, 0x00004a23, 0x00004981, + 0x000048ce, 0x00004836, 0x0000478c, 0x000046eb, 0x00004656, 0x000045b6, 0x00004524, 0x0000449c, + 0x000043ff, 0x00004370, 0x000042e2, 0x00004257, 0x000041ce, 0x00004147, 0x000040c3, 0x00004081, + 0x00003fff, 0x00003f57, 0x00003ed3, 0x00003e54, 0x00003dd9, 0x00003d60, 0x00003ced, 0x00003c78, + 0x00003c07, 0x00003b9a, 0x00003b26, 0x00003abf, 0x00003a4f, 0x000039e1, 0x0000397e, 0x00003917, + 0x000038af, 0x00003848, 0x000037ee, 0x00003787, 0x00003726, 0x000036c9, 0x0000366b, 0x0000360d, + 0x000035b0, 0x00003567, 0x00003503, 0x000034aa, 0x00003453, 0x000033ff, 0x000033a8, 0x0000335c, + 0x00003305, 0x000032b3, 0x00003266, 0x00003213, 0x000031c7, 0x00003178, 0x0000312b, 0x000030df, + 0x00003094, 0x00003049, 0x00003018, 0x00002fc0, 0x00002f76, 0x00002f2d, 0x00002ee8, 0x00002ea6, + 0x00002e5f, 0x00002e1c, 0x00002dd9, 0x00002d99, 0x00002d59, 0x00002d17, 0x00002cdf, 0x00002c9b, + 0x00002c5d, 0x00002c1c, 0x00002be1, 0x00002ba6, 0x00002b6a, 0x00002b2e, 0x00002af3, 0x00002ac7, + 0x00002a85, 0x00002a4a, 0x00002a11, 0x000029dc, 0x000029a6, 0x0000296e, 0x00002936, 0x00002904, + 0x000028cd, 0x0000289a, 0x00002866, 0x00002833, 0x0000280a, 0x000027d0, 0x0000279e, 0x0000276f, + 0x0000273c, 0x0000270d, 0x000026de, 0x000026ad, 0x0000267e, 0x00002652, 0x00002622, 0x000025f5, + 0x000025c9, 0x0000259b, 0x0000256f, 0x00002545, 0x00002518, 0x000024ef, 0x000024c3, 0x0000249c, + 0x0000246f, 0x00002446, 0x0000241c, 0x000023f4, 0x000023ca, 0x000023a2, 0x0000237b, 0x00002354, + 0x0000232e, 0x00002306, 0x000022e0, 0x000022b9, 0x00002294, 0x0000226f, 0x0000224b, 0x00002226, + 0x00002202, 0x000021dc, 0x000021b8, 0x00002195, 0x00002172, 0x0000214f, 0x0000212c, 0x0000210a, + 0x000020e7, 0x000020c5, 0x000020a4, 0x00002083, 0x00002061, 0x00002041, 0x00002020, 0x00002020 +}; - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - } -} +/* Lossy premultiplication: 11-bit * alpha -> 11-bit. Not perfectly reversible. 
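+ *
+ * The 11-bit range here is assumed to be the linear-light form produced by
+ * _smol_from_srgb_lut above, which maps 8-bit sRGB values into [0..2047];
+ * otherwise the mechanics are the same as for the 8-bit table.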
*/ +const uint32_t _smol_inv_div_p8l_lut [256] = +{ + 0x00000000, 0x0007ffff, 0x0003ffff, 0x0002aaaa, 0x0001ffff, 0x00019999, 0x00015555, 0x00012492, + 0x0000ffff, 0x0000e38e, 0x0000cccc, 0x0000ba2e, 0x0000aaaa, 0x00009d89, 0x00009249, 0x00008888, + 0x00007fff, 0x00007878, 0x000071c7, 0x00006bca, 0x00006666, 0x00006186, 0x00005d17, 0x0000590b, + 0x00005555, 0x000051eb, 0x00004ec4, 0x00004bda, 0x00004924, 0x0000469e, 0x00004444, 0x00004210, + 0x00003fff, 0x00003e0f, 0x00003c3c, 0x00003a83, 0x000038e3, 0x0000372a, 0x000035b7, 0x00003458, + 0x0000330a, 0x000031cc, 0x0000309e, 0x00002f7d, 0x00002e69, 0x00002d62, 0x00002c66, 0x00002b75, + 0x00002a8e, 0x000029b0, 0x000028db, 0x0000280f, 0x0000274a, 0x0000268c, 0x000025d6, 0x00002526, + 0x0000247d, 0x000023d9, 0x0000233c, 0x000022a3, 0x0000220f, 0x00002181, 0x000020f7, 0x00002071, + 0x00001ff0, 0x00001f72, 0x00001ef8, 0x00001e82, 0x00001e0f, 0x00001da0, 0x00001d34, 0x00001ccb, + 0x00001c65, 0x00001bf5, 0x00001b95, 0x00001b37, 0x00001adb, 0x00001a82, 0x00001a2c, 0x000019d7, + 0x00001985, 0x00001934, 0x000018e6, 0x00001899, 0x0000184f, 0x00001806, 0x000017be, 0x00001779, + 0x00001734, 0x000016f2, 0x000016b1, 0x00001671, 0x00001633, 0x000015f6, 0x000015ba, 0x00001580, + 0x00001547, 0x0000150f, 0x000014d8, 0x000014a2, 0x0000146d, 0x0000143a, 0x00001407, 0x000013d5, + 0x000013a5, 0x00001375, 0x00001346, 0x00001318, 0x000012eb, 0x000012be, 0x0000128e, 0x00001263, + 0x00001239, 0x00001210, 0x000011e7, 0x000011c0, 0x00001199, 0x00001172, 0x0000114d, 0x00001127, + 0x00001103, 0x000010df, 0x000010bc, 0x00001099, 0x00001077, 0x00001055, 0x00001034, 0x00001014, + 0x00000ff4, 0x00000fd4, 0x00000fb5, 0x00000f96, 0x00000f78, 0x00000f5a, 0x00000f3d, 0x00000f20, + 0x00000f04, 0x00000ee8, 0x00000ecc, 0x00000eb1, 0x00000e96, 0x00000e7c, 0x00000e62, 0x00000e48, + 0x00000e2f, 0x00000e16, 0x00000dfa, 0x00000de2, 0x00000dca, 0x00000db2, 0x00000d9b, 0x00000d84, + 0x00000d6d, 0x00000d57, 0x00000d41, 0x00000d2b, 0x00000d16, 0x00000d00, 0x00000ceb, 0x00000cd7, + 0x00000cc2, 0x00000cae, 0x00000c9a, 0x00000c86, 0x00000c73, 0x00000c5f, 0x00000c4c, 0x00000c3a, + 0x00000c27, 0x00000c15, 0x00000c03, 0x00000bf1, 0x00000bdf, 0x00000bcd, 0x00000bbc, 0x00000bab, + 0x00000b9a, 0x00000b89, 0x00000b79, 0x00000b68, 0x00000b58, 0x00000b48, 0x00000b38, 0x00000b27, + 0x00000b17, 0x00000b08, 0x00000af9, 0x00000aea, 0x00000adb, 0x00000acc, 0x00000abe, 0x00000ab0, + 0x00000aa1, 0x00000a93, 0x00000a85, 0x00000a78, 0x00000a6a, 0x00000a5c, 0x00000a4f, 0x00000a42, + 0x00000a35, 0x00000a28, 0x00000a1b, 0x00000a0e, 0x00000a02, 0x000009f5, 0x000009e9, 0x000009dd, + 0x000009d1, 0x000009c5, 0x000009b9, 0x000009ad, 0x000009a1, 0x00000996, 0x0000098a, 0x0000097f, + 0x00000974, 0x00000969, 0x0000095e, 0x00000951, 0x00000947, 0x0000093c, 0x00000931, 0x00000927, + 0x0000091c, 0x00000912, 0x00000908, 0x000008fe, 0x000008f3, 0x000008e9, 0x000008e0, 0x000008d6, + 0x000008cc, 0x000008c2, 0x000008b9, 0x000008af, 0x000008a6, 0x0000089d, 0x00000893, 0x0000088a, + 0x00000881, 0x00000878, 0x0000086f, 0x00000866, 0x0000085e, 0x00000855, 0x0000084c, 0x00000844, + 0x0000083b, 0x00000833, 0x0000082a, 0x00000822, 0x0000081a, 0x00000812, 0x0000080a, 0x00000801 +}; -static void -pack_row_132a_p_to_123_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +/* Lossless premultiplication: 8-bit * alpha -> 16-bit. Reversible with this table. 
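+ *
+ * Sketch of why the 16-bit form can be exact (the precise scheme used by this
+ * table is not restated here): if premultiplication keeps the full product
+ * p = c * a of an 8-bit channel c and 8-bit alpha a, then p <= 255 * 255 = 65025
+ * fits in 16 bits and c = p / a is recovered exactly; the lossy 8-bit variant
+ * above rounds p back down to 8 bits first, so its division can only
+ * approximate c.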
*/ +const uint32_t _smol_inv_div_p16_lut [256] = +{ + 0x00000000, 0x00005556, 0x00004000, 0x00003334, 0x00002aab, 0x00002493, 0x00002000, 0x00001c72, + 0x0000199a, 0x00001746, 0x00001556, 0x000013b2, 0x0000124a, 0x00001112, 0x00001000, 0x00000f10, + 0x00000e39, 0x00000d7a, 0x00000ccd, 0x00000c31, 0x00000ba3, 0x00000b22, 0x00000aab, 0x00000a3e, + 0x000009d9, 0x0000097c, 0x00000925, 0x000008d4, 0x00000889, 0x00000843, 0x00000800, 0x000007c2, + 0x00000788, 0x00000751, 0x0000071d, 0x000006ec, 0x000006bd, 0x00000691, 0x00000667, 0x0000063f, + 0x00000619, 0x000005f5, 0x000005d2, 0x000005b1, 0x00000591, 0x00000573, 0x00000556, 0x0000053a, + 0x0000051f, 0x00000506, 0x000004ed, 0x000004d5, 0x000004be, 0x000004a8, 0x00000493, 0x0000047e, + 0x0000046a, 0x00000457, 0x00000445, 0x00000433, 0x00000422, 0x00000411, 0x00000400, 0x000003f1, + 0x000003e1, 0x000003d3, 0x000003c4, 0x000003b6, 0x000003a9, 0x0000039c, 0x0000038f, 0x00000382, + 0x00000376, 0x0000036a, 0x0000035f, 0x00000354, 0x00000349, 0x0000033e, 0x00000334, 0x0000032a, + 0x00000320, 0x00000316, 0x0000030d, 0x00000304, 0x000002fb, 0x000002f2, 0x000002e9, 0x000002e1, + 0x000002d9, 0x000002d1, 0x000002c9, 0x000002c1, 0x000002ba, 0x000002b2, 0x000002ab, 0x000002a4, + 0x0000029d, 0x00000296, 0x00000290, 0x00000289, 0x00000283, 0x0000027d, 0x00000277, 0x00000271, + 0x0000026b, 0x00000265, 0x0000025f, 0x0000025a, 0x00000254, 0x0000024f, 0x0000024a, 0x00000244, + 0x0000023f, 0x0000023a, 0x00000235, 0x00000231, 0x0000022c, 0x00000227, 0x00000223, 0x0000021e, + 0x0000021a, 0x00000215, 0x00000211, 0x0000020d, 0x00000209, 0x00000205, 0x00000200, 0x000001fd, + 0x000001f9, 0x000001f5, 0x000001f1, 0x000001ed, 0x000001ea, 0x000001e6, 0x000001e2, 0x000001df, + 0x000001db, 0x000001d8, 0x000001d5, 0x000001d1, 0x000001ce, 0x000001cb, 0x000001c8, 0x000001c4, + 0x000001c1, 0x000001be, 0x000001bb, 0x000001b8, 0x000001b5, 0x000001b3, 0x000001b0, 0x000001ad, + 0x000001aa, 0x000001a7, 0x000001a5, 0x000001a2, 0x0000019f, 0x0000019d, 0x0000019a, 0x00000198, + 0x00000195, 0x00000193, 0x00000190, 0x0000018e, 0x0000018b, 0x00000189, 0x00000187, 0x00000184, + 0x00000182, 0x00000180, 0x0000017e, 0x0000017b, 0x00000179, 0x00000177, 0x00000175, 0x00000173, + 0x00000171, 0x0000016f, 0x0000016d, 0x0000016b, 0x00000169, 0x00000167, 0x00000165, 0x00000163, + 0x00000161, 0x0000015f, 0x0000015d, 0x0000015b, 0x00000159, 0x00000158, 0x00000156, 0x00000154, + 0x00000152, 0x00000151, 0x0000014f, 0x0000014d, 0x0000014b, 0x0000014a, 0x00000148, 0x00000147, + 0x00000145, 0x00000143, 0x00000142, 0x00000140, 0x0000013f, 0x0000013d, 0x0000013c, 0x0000013a, + 0x00000139, 0x00000137, 0x00000136, 0x00000134, 0x00000133, 0x00000131, 0x00000130, 0x0000012f, + 0x0000012d, 0x0000012c, 0x0000012a, 0x00000129, 0x00000128, 0x00000126, 0x00000125, 0x00000124, + 0x00000122, 0x00000121, 0x00000120, 0x0000011f, 0x0000011d, 0x0000011c, 0x0000011b, 0x0000011a, + 0x00000119, 0x00000117, 0x00000116, 0x00000115, 0x00000114, 0x00000113, 0x00000112, 0x00000110, + 0x0000010f, 0x0000010e, 0x0000010d, 0x0000010c, 0x0000010b, 0x0000010a, 0x00000109, 0x00000108, + 0x00000107, 0x00000106, 0x00000105, 0x00000104, 0x00000103, 0x00000102, 0x00000100, 0x00000100 +}; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* Lossless premultiplication: 11-bit * alpha -> 19-bit. Reversible with this table. 
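+ *
+ * Size check for the 19-bit product: an 11-bit linear value (max 2047) times
+ * an 8-bit alpha (max 255) is at most 2047 * 255 = 521985, which fits below
+ * 2^19 = 524288.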
*/ +const uint32_t _smol_inv_div_p16l_lut [256] = +{ + 0x00000000, 0x0002aaab, 0x00020000, 0x0001999a, 0x00015556, 0x00012493, 0x00010000, 0x0000e38f, + 0x0000cccd, 0x0000ba2f, 0x0000aaab, 0x00009d8a, 0x0000924a, 0x00008889, 0x00008000, 0x00007879, + 0x000071c8, 0x00006bcb, 0x00006667, 0x00006187, 0x00005d18, 0x0000590c, 0x00005556, 0x000051ec, + 0x00004ec5, 0x00004bdb, 0x00004925, 0x0000469f, 0x00004445, 0x00004211, 0x00004000, 0x00003e10, + 0x00003c3d, 0x00003a84, 0x000038e4, 0x0000375a, 0x000035e6, 0x00003484, 0x00003334, 0x000031f4, + 0x000030c4, 0x00002fa1, 0x00002e8c, 0x00002d83, 0x00002c86, 0x00002b94, 0x00002aab, 0x000029cc, + 0x000028f6, 0x00002829, 0x00002763, 0x000026a5, 0x000025ee, 0x0000253d, 0x00002493, 0x000023ef, + 0x00002350, 0x000022b7, 0x00002223, 0x00002193, 0x00002109, 0x00002083, 0x00002000, 0x00001f82, + 0x00001f08, 0x00001e92, 0x00001e1f, 0x00001daf, 0x00001d42, 0x00001cd9, 0x00001c72, 0x00001c0f, + 0x00001bad, 0x00001b4f, 0x00001af3, 0x00001a99, 0x00001a42, 0x000019ed, 0x0000199a, 0x00001949, + 0x000018fa, 0x000018ad, 0x00001862, 0x00001819, 0x000017d1, 0x0000178b, 0x00001746, 0x00001703, + 0x000016c2, 0x00001682, 0x00001643, 0x00001606, 0x000015ca, 0x0000158f, 0x00001556, 0x0000151e, + 0x000014e6, 0x000014b0, 0x0000147b, 0x00001447, 0x00001415, 0x000013e3, 0x000013b2, 0x00001382, + 0x00001353, 0x00001324, 0x000012f7, 0x000012ca, 0x0000129f, 0x00001274, 0x0000124a, 0x00001220, + 0x000011f8, 0x000011d0, 0x000011a8, 0x00001182, 0x0000115c, 0x00001136, 0x00001112, 0x000010ed, + 0x000010ca, 0x000010a7, 0x00001085, 0x00001063, 0x00001042, 0x00001021, 0x00001000, 0x00000fe1, + 0x00000fc1, 0x00000fa3, 0x00000f84, 0x00000f67, 0x00000f49, 0x00000f2c, 0x00000f10, 0x00000ef3, + 0x00000ed8, 0x00000ebc, 0x00000ea1, 0x00000e87, 0x00000e6d, 0x00000e53, 0x00000e39, 0x00000e20, + 0x00000e08, 0x00000def, 0x00000dd7, 0x00000dbf, 0x00000da8, 0x00000d91, 0x00000d7a, 0x00000d63, + 0x00000d4d, 0x00000d37, 0x00000d21, 0x00000d0c, 0x00000cf7, 0x00000ce2, 0x00000ccd, 0x00000cb9, + 0x00000ca5, 0x00000c91, 0x00000c7d, 0x00000c6a, 0x00000c57, 0x00000c44, 0x00000c31, 0x00000c1f, + 0x00000c0d, 0x00000bfb, 0x00000be9, 0x00000bd7, 0x00000bc6, 0x00000bb4, 0x00000ba3, 0x00000b93, + 0x00000b82, 0x00000b71, 0x00000b61, 0x00000b51, 0x00000b41, 0x00000b31, 0x00000b22, 0x00000b12, + 0x00000b03, 0x00000af4, 0x00000ae5, 0x00000ad7, 0x00000ac8, 0x00000ab9, 0x00000aab, 0x00000a9d, + 0x00000a8f, 0x00000a81, 0x00000a73, 0x00000a66, 0x00000a58, 0x00000a4b, 0x00000a3e, 0x00000a31, + 0x00000a24, 0x00000a17, 0x00000a0b, 0x000009fe, 0x000009f2, 0x000009e5, 0x000009d9, 0x000009cd, + 0x000009c1, 0x000009b5, 0x000009aa, 0x0000099e, 0x00000992, 0x00000987, 0x0000097c, 0x00000971, + 0x00000965, 0x0000095b, 0x00000950, 0x00000945, 0x0000093a, 0x00000930, 0x00000925, 0x0000091b, + 0x00000910, 0x00000906, 0x000008fc, 0x000008f2, 0x000008e8, 0x000008de, 0x000008d4, 0x000008cb, + 0x000008c1, 0x000008b8, 0x000008ae, 0x000008a5, 0x0000089b, 0x00000892, 0x00000889, 0x00000880, + 0x00000877, 0x0000086e, 0x00000865, 0x0000085c, 0x00000854, 0x0000084b, 0x00000843, 0x0000083a, + 0x00000832, 0x00000829, 0x00000821, 0x00000819, 0x00000811, 0x00000809, 0x00000800, 0x000007f9 +}; - while (row_out != row_out_max) - { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } -} +/* ------- * + * Helpers * + * ------- */ -static void -pack_row_132a_p_to_321_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, 
- uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE int +check_row_range (const SmolScaleCtx *scale_ctx, + int32_t *first_dest_row, + int32_t *n_dest_rows) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (*first_dest_row < 0) { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; + *n_dest_rows += *first_dest_row; + *first_dest_row = 0; } -} - -#define DEF_PACK_FROM_1324_P_TO_P_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (uint64_t in) \ -{ \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1324_p_to_##a##b##c##d##_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_1324_P_TO_P_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_1324_P_TO_P_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_1324_P_TO_P_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 3, 2, 1) - -static SMOL_INLINE uint32_t -pack_pixel_1234_p_to_1234_p_128bpp (const uint64_t *in) -{ - /* FIXME: Are masks needed? */ - return ((in [0] >> 8) & 0xff000000) - | ((in [0] << 16) & 0x00ff0000) - | ((in [1] >> 24) & 0x0000ff00) - | (in [1] & 0x000000ff); -} - -static void -pack_row_1234_p_to_1234_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (*first_dest_row >= (int32_t) scale_ctx->vdim.dest_size_px) { - *(row_out++) = pack_pixel_1234_p_to_1234_p_128bpp (row_in); - row_in += 2; + return 0; } -} - -#define DEF_PACK_FROM_1234_P_TO_P_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - return PACK_FROM_1234_128BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_1234_P_TO_P_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_1234_P_TO_P_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_1234_P_TO_P_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (*n_dest_rows < 0 || *first_dest_row + *n_dest_rows > (int32_t) scale_ctx->vdim.dest_size_px) { - *(row_out++) = *row_in >> 32; - *(row_out++) = *(row_in++); - *(row_out++) = *(row_in++) >> 32; + *n_dest_rows = scale_ctx->vdim.dest_size_px - 
*first_dest_row; } -} - -static void -pack_row_123a_p_to_321_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (*n_dest_rows == 0) { - *(row_out++) = row_in [1] >> 32; - *(row_out++) = row_in [0]; - *(row_out++) = row_in [0] >> 32; - row_in += 2; + return 0; } -} -/* Pack p (alpha last) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_132a_p_to_1234_u_64bpp (uint64_t in) -{ - uint8_t alpha = in; - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; - return in | (in >> 24); + return 1; } -static void -pack_row_132a_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - } -} +/* ------------------- * + * Scaling: Outer loop * + * ------------------- */ -static void -pack_row_132a_p_to_123_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; } -static void -pack_row_132a_p_to_321_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE char * +dest_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t dest_row_ofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_132A_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = in; \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_132a_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_132A_P_TO_U_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_123A_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [1]; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ 
-pack_row_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ + return scale_ctx->dest_pixels + scale_ctx->dest_rowstride * dest_row_ofs; } -DEF_PACK_FROM_123A_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void * +dest_hofs_to_pointer (const SmolScaleCtx *scale_ctx, + void *dest_row_ptr, + uint32_t dest_hofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + uint8_t *dest_row_ptr_u8 = dest_row_ptr; + return dest_row_ptr_u8 + dest_hofs * pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride; } static void -pack_row_123a_p_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -/* Pack p (alpha first) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_a324_p_to_1234_u_64bpp (uint64_t in) +copy_row (const SmolScaleCtx *scale_ctx, + uint32_t dest_row_index, + uint32_t *row_out) { - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); - return in | (in >> 24); + memcpy (row_out, + src_row_ofs_to_pointer (scale_ctx, dest_row_index), + scale_ctx->hdim.dest_size_px * pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride); } static void -pack_row_a324_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +scale_dest_row (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index, + void *row_out) { - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (dest_row_index < scale_ctx->vdim.clear_before_px + || dest_row_index >= scale_ctx->vdim.dest_size_px - scale_ctx->vdim.clear_after_px) { - *(row_out++) = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - } -} - -static void -pack_row_a324_p_to_234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + /* Row doesn't intersect placement */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear entire row */ + 
scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + row_out, + scale_ctx->hdim.dest_size_px); + } } -} - -static void -pack_row_a324_p_to_432_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} - -#define DEF_PACK_FROM_A324_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a324_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_A324_P_TO_U_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_A324_P_TO_U_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_A324_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_A234_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [0] >> 32; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [0] = (t [0] & 0x00000000ffffffff) | ((uint64_t) alpha << 32); \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_A234_P_TO_U_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_A234_P_TO_U_128BPP (4, 3, 2, 1) + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear left */ + scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + row_out, + scale_ctx->hdim.clear_before_px); + } -static void -pack_row_a234_p_to_234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + if (scale_ctx->is_noop) + { + copy_row (scale_ctx, dest_row_index, row_out); + } + else + { + int scaled_row_index; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + scaled_row_index = scale_ctx->vfilter_func (scale_ctx, + local_ctx, + dest_row_index - scale_ctx->vdim.clear_before_px); - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } -} + if ((scale_ctx->composite_op == SMOL_COMPOSITE_SRC + || scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + && scale_ctx->have_composite_color) + { + scale_ctx->composite_over_color_func (local_ctx->parts_row 
[scaled_row_index], + scale_ctx->color_pixel, + scale_ctx->hdim.placement_size_px); + } -static void -pack_row_a234_p_to_432_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + scale_ctx->pack_row_func (local_ctx->parts_row [scaled_row_index], + dest_hofs_to_pointer (scale_ctx, row_out, scale_ctx->hdim.placement_ofs_px), + scale_ctx->hdim.placement_size_px); - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + } - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear right */ + scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + dest_hofs_to_pointer (scale_ctx, row_out, + scale_ctx->hdim.placement_ofs_px + + scale_ctx->hdim.placement_size_px), + scale_ctx->hdim.clear_after_px); + } } -} - -/* Pack i (alpha last) to u */ -static SMOL_INLINE uint32_t -pack_pixel_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT in) -{ - uint8_t alpha = (in [1] >> 8) & 0xff; - uint64_t t [2]; - - unpremul_i_to_u_128bpp (in, t, alpha); - - return ((t [0] >> 8) & 0xff000000) - | ((t [0] << 16) & 0x00ff0000) - | ((t [1] >> 24) & 0x0000ff00) - | alpha; + if (scale_ctx->post_row_func) + scale_ctx->post_row_func (row_out, scale_ctx->hdim.dest_size_px, scale_ctx->user_data); } static void -pack_row_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +do_rows (const SmolScaleCtx *scale_ctx, + void *dest, + uint32_t row_dest_index, + uint32_t n_rows) { - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - } -} + SmolLocalCtx local_ctx = { 0 }; + uint32_t n_parts_per_pixel = 1; + uint32_t n_stored_rows = 4; + uint32_t i; -static void -pack_row_123a_i_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + if (scale_ctx->storage_type == SMOL_STORAGE_128BPP) + n_parts_per_pixel = 2; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + /* Must be one less, or this test in update_local_ctx() will wrap around: + * if (new_src_ofs == local_ctx->src_ofs + 1) { ... } */ + local_ctx.src_ofs = UINT_MAX - 1; - while (row_out != row_out_max) + for (i = 0; i < n_stored_rows; i++) { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } -} - -static void -pack_row_123a_i_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + /* Allocate space for an extra pixel at the rightmost edge. This pixel + * allows bilinear horizontal sampling to exceed the input width and + * produce transparency when the output is smaller than its whole-pixel + * count. This is especially noticeable with halving, which can + * produce 2^n such samples (the extra pixel is sampled repeatedly in + * those cases). + * + * FIXME: This is no longer true, and the extra storage is probably not + * needed. 
The edge transparency is now handled by applying a precalculated + * opacity directly. We should verify that the extra storage can be + * eliminated without overruns. */ + + local_ctx.parts_row [i] = + smol_alloc_aligned (MAX (scale_ctx->hdim.src_size_px + 1, scale_ctx->hdim.placement_size_px) + * n_parts_per_pixel * sizeof (uint64_t), + &local_ctx.row_storage [i]); - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; + local_ctx.parts_row [i] [scale_ctx->hdim.src_size_px * n_parts_per_pixel] = 0; + if (n_parts_per_pixel == 2) + local_ctx.parts_row [i] [scale_ctx->hdim.src_size_px * n_parts_per_pixel + 1] = 0; } -} - -#define DEF_PACK_FROM_123A_I_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint8_t alpha = (in [1] >> 8) & 0xff; \ - uint64_t t [2]; \ - unpremul_i_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_I_TO_U_128BPP(3, 2, 1, 4) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 1, 2, 3) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 3, 2, 1) - -/* Unpack p -> p */ -static SMOL_INLINE uint64_t -unpack_pixel_1234_p_to_1324_p_64bpp (uint32_t p) -{ - return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); -} - -/* AVX2 has a useful instruction for this: __m256i _mm256_cvtepu8_epi16 (__m128i a); - * It results in a different channel ordering, so it'd be important to match with - * the right kind of re-pack. */ -static void -unpack_row_1234_p_to_1324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + for (i = row_dest_index; i < row_dest_index + n_rows; i++) { - *(row_out++) = unpack_pixel_1234_p_to_1324_p_64bpp (*(row_in++)); + scale_dest_row (scale_ctx, &local_ctx, i, dest); + dest = (char *) dest + scale_ctx->dest_rowstride; } -} -static SMOL_INLINE uint64_t -unpack_pixel_123_p_to_132a_p_64bpp (const uint8_t *p) -{ - return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) - | ((uint64_t) p [2] << 32) | 0xff; -} - -static void -unpack_row_123_p_to_132a_p_64bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + for (i = 0; i < n_stored_rows; i++) { - *(row_out++) = unpack_pixel_123_p_to_132a_p_64bpp (row_in); - row_in += 3; + smol_free (local_ctx.row_storage [i]); } -} -static SMOL_INLINE void -unpack_pixel_1234_p_to_1234_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); - out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); + /* Used to align row data if needed. May be allocated in scale_horizontal(). 
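+     *
+     * Presumably this matters because the unpack/scale paths assume aligned
+     * row storage (see the SMOL_ASSUME_ALIGNED hints and the AVX2 build), so
+     * unaligned caller-supplied pixel data would be copied into an aligned
+     * buffer first and released here.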
*/ + if (local_ctx.src_aligned) + smol_free (local_ctx.src_aligned_storage); } -static void -unpack_row_1234_p_to_1234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_1234_p_to_1234_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} +/* -------------------- * + * Architecture support * + * -------------------- */ -static SMOL_INLINE void -unpack_pixel_123_p_to_123a_p_128bpp (const uint8_t *in, - uint64_t *out) -{ - out [0] = ((uint64_t) in [0] << 32) | in [1]; - out [1] = ((uint64_t) in [2] << 32) | 0xff; -} +#ifdef SMOL_WITH_AVX2 -static void -unpack_row_123_p_to_123a_p_128bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SmolBool +have_avx2 (void) { - uint64_t *row_out_max = row_out + n_pixels * 2; + __builtin_cpu_init (); - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + if (__builtin_cpu_supports ("avx2")) + return TRUE; - while (row_out != row_out_max) - { - unpack_pixel_123_p_to_123a_p_128bpp (row_in, row_out); - row_in += 3; - row_out += 2; - } + return FALSE; } -/* Unpack u (alpha first) -> p */ - -static SMOL_INLINE uint64_t -unpack_pixel_a234_u_to_a324_p_64bpp (uint32_t p) -{ - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; - - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); -} +#endif -static void -unpack_row_a234_u_to_a324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SmolBool +host_is_little_endian (void) { - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + static const union { - *(row_out++) = unpack_pixel_a234_u_to_a324_p_64bpp (*(row_in++)); + uint8_t u8 [4]; + uint32_t u32; } -} - -static SMOL_INLINE void -unpack_pixel_a234_u_to_a234_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; - - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} - -static void -unpack_row_a234_u_to_a234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + host_bytes = { { 0, 1, 2, 3 } }; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + if (host_bytes.u32 == 0x03020100UL) + return TRUE; - while (row_out != row_out_max) - { - unpack_pixel_a234_u_to_a234_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack u (alpha first) -> i */ - -static SMOL_INLINE void -unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p >> 24; - - out [0] = (((((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8)) * alpha)); - out [1] = (((((p64 & 0x000000ff) << 32) * alpha))) | (alpha << 8) | 0x80; -} - -static void -unpack_row_a234_u_to_234a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_a234_u_to_234a_i_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack 
u (alpha last) -> p */ - -static SMOL_INLINE uint64_t -unpack_pixel_123a_u_to_132a_p_64bpp (uint32_t p) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; - - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); -} - -static void -unpack_row_123a_u_to_132a_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = unpack_pixel_123a_u_to_132a_p_64bpp (*(row_in++)); - } -} - -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; - - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} - -static void -unpack_row_123a_u_to_123a_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack u (alpha last) -> i */ - -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p & 0xff; - - out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); - out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; -} - -static void -unpack_row_123a_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_i_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* --- Filter helpers --- */ - -static SMOL_INLINE const uint32_t * -inrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t inrow_ofs) -{ - return (const uint32_t *) (((const uint8_t *) scale_ctx->pixels_in) + scale_ctx->rowstride_in * inrow_ofs); -} - -static SMOL_INLINE uint32_t * -outrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t outrow_ofs) -{ - return (uint32_t *) (((uint8_t *) scale_ctx->pixels_out) + scale_ctx->rowstride_out * outrow_ofs); -} - -static SMOL_INLINE uint64_t -weight_pixel_64bpp (uint64_t p, - uint16_t w) -{ - return ((p * w) >> 8) & 0x00ff00ff00ff00ff; -} - -/* p and out may be the same address */ -static SMOL_INLINE void -weight_pixel_128bpp (uint64_t *p, - uint64_t *out, - uint16_t w) -{ - out [0] = ((p [0] * w) >> 8) & 0x00ffffff00ffffffULL; - out [1] = ((p [1] * w) >> 8) & 0x00ffffff00ffffffULL; -} - -static SMOL_INLINE void -sum_parts_64bpp (const uint64_t ** SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT accum, - uint32_t n) -{ - const uint64_t *pp_end; - const uint64_t * SMOL_RESTRICT pp = *parts_in; - - SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t)); - - for (pp_end = pp + n; pp < pp_end; pp++) - { - *accum += *pp; - } - - *parts_in = pp; -} - -static SMOL_INLINE void -sum_parts_128bpp (const uint64_t ** SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT accum, - uint32_t n) -{ - const uint64_t *pp_end; - const uint64_t * SMOL_RESTRICT pp = *parts_in; - - 
SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t) * 2); - - for (pp_end = pp + n * 2; pp < pp_end; ) - { - accum [0] += *(pp++); - accum [1] += *(pp++); - } - - *parts_in = pp; -} - -static SMOL_INLINE uint64_t -scale_64bpp (uint64_t accum, - uint64_t multiplier) -{ - uint64_t a, b; - - /* Average the inputs */ - a = ((accum & 0x0000ffff0000ffffULL) * multiplier - + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; - b = (((accum & 0xffff0000ffff0000ULL) >> 16) * multiplier - + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; - - /* Return pixel */ - return (a & 0x000000ff000000ffULL) | ((b & 0x000000ff000000ffULL) << 16); -} - -static SMOL_INLINE uint64_t -scale_128bpp_half (uint64_t accum, - uint64_t multiplier) -{ - uint64_t a, b; - - a = accum & 0x00000000ffffffffULL; - a = (a * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - - b = (accum & 0xffffffff00000000ULL) >> 32; - b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - - return (a & 0x000000000000ffffULL) - | ((b & 0x000000000000ffffULL) << 32); -} - -static SMOL_INLINE void -scale_and_store_128bpp (const uint64_t * SMOL_RESTRICT accum, - uint64_t multiplier, - uint64_t ** SMOL_RESTRICT row_parts_out) -{ - *(*row_parts_out)++ = scale_128bpp_half (accum [0], multiplier); - *(*row_parts_out)++ = scale_128bpp_half (accum [1], multiplier); -} - -static void -add_parts (const uint64_t * SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT parts_acc_out, - uint32_t n) -{ - const uint64_t *parts_in_max = parts_in + n; - - SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); - - while (parts_in < parts_in_max) - *(parts_acc_out++) += *(parts_in++); -} - -/* --- Precalculation --- */ - -static void -pick_filter_params (uint32_t dim_in, - uint32_t dim_out, - uint32_t *halvings_out, - uint32_t *dim_bilin_out, - SmolFilterType *filter_out, - SmolStorageType *storage_out) -{ - *dim_bilin_out = dim_out; - *storage_out = SMOL_STORAGE_64BPP; - - /* The box algorithms are only sufficiently precise when - * dim_in > dim_out * 5. box_64bpp typically starts outperforming - * bilinear+halving at dim_in > dim_out * 8. */ - - if (dim_in > dim_out * 255) - { - *filter_out = SMOL_FILTER_BOX; - *storage_out = SMOL_STORAGE_128BPP; - } - else if (dim_in > dim_out * 8) - { - *filter_out = SMOL_FILTER_BOX; - } - else if (dim_in == 1) - { - *filter_out = SMOL_FILTER_ONE; - } - else if (dim_in == dim_out) - { - *filter_out = SMOL_FILTER_COPY; - } - else - { - uint32_t n_halvings = 0; - uint32_t d = dim_out; - - for (;;) - { - d *= 2; - if (d >= dim_in) - break; - n_halvings++; - } - - dim_out <<= n_halvings; - *dim_bilin_out = dim_out; - *filter_out = SMOL_FILTER_BILINEAR_0H + n_halvings; - *halvings_out = n_halvings; - } -} - -static void -precalc_bilinear_array (uint16_t *array, - uint32_t dim_in, - uint32_t dim_out, - unsigned int make_absolute_offsets) -{ - uint64_t ofs_stepF, fracF, frac_stepF; - uint16_t *pu16 = array; - uint16_t last_ofs = 0; - - if (dim_in > dim_out) - { - /* Minification */ - frac_stepF = ofs_stepF = (dim_in * SMOL_BILIN_MULTIPLIER) / dim_out; - fracF = (frac_stepF - SMOL_BILIN_MULTIPLIER) / 2; - } - else - { - /* Magnification */ - frac_stepF = ofs_stepF = ((dim_in - 1) * SMOL_BILIN_MULTIPLIER) / (dim_out > 1 ? 
(dim_out - 1) : 1); - fracF = 0; - } - - do - { - uint16_t ofs = fracF / SMOL_BILIN_MULTIPLIER; - - /* We sample ofs and its neighbor -- prevent out of bounds access - * for the latter. */ - if (ofs >= dim_in - 1) - break; - - *(pu16++) = make_absolute_offsets ? ofs : ofs - last_ofs; - *(pu16++) = SMOL_SMALL_MUL - ((fracF / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); - fracF += frac_stepF; - - last_ofs = ofs; - } - while (--dim_out); - - /* Instead of going out of bounds, sample the final pair of pixels with a 100% - * bias towards the last pixel */ - while (dim_out) - { - *(pu16++) = make_absolute_offsets ? dim_in - 2 : (dim_in - 2) - last_ofs; - *(pu16++) = 0; - dim_out--; - - last_ofs = dim_in - 2; - } -} - -static void -precalc_boxes_array (uint16_t *array, - uint32_t *span_mul, - uint32_t dim_in, - uint32_t dim_out, - unsigned int make_absolute_offsets) -{ - uint64_t fracF, frac_stepF; - uint16_t *pu16 = array; - uint16_t ofs, next_ofs; - uint64_t f; - uint64_t stride; - uint64_t a, b; - - frac_stepF = ((uint64_t) dim_in * SMOL_BIG_MUL) / (uint64_t) dim_out; - fracF = 0; - ofs = 0; - - stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; - f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; - - a = (SMOL_BOXES_MULTIPLIER * 255); - b = ((stride * 255) + ((f * 255) / 256)); - *span_mul = (a + (b / 2)) / b; - - do - { - fracF += frac_stepF; - next_ofs = (uint64_t) fracF / ((uint64_t) SMOL_BIG_MUL); - - /* Prevent out of bounds access */ - if (ofs >= dim_in - 1) - break; - - if (next_ofs > dim_in) - { - next_ofs = dim_in; - if (next_ofs <= ofs) - break; - } - - stride = next_ofs - ofs - 1; - f = (fracF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; - - /* Fraction is the other way around, since left pixel of each span - * comes first, and it's on the right side of the fractional sample. */ - *(pu16++) = make_absolute_offsets ? ofs : stride; - *(pu16++) = f; - - ofs = next_ofs; - } - while (--dim_out); - - /* Instead of going out of bounds, sample the final pair of pixels with a 100% - * bias towards the last pixel */ - while (dim_out) - { - *(pu16++) = make_absolute_offsets ? ofs : 0; - *(pu16++) = 0; - dim_out--; - } - - *(pu16++) = make_absolute_offsets ? 
ofs : 0; - *(pu16++) = 0; -} - -/* --- Horizontal scaling --- */ - -#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum = 0; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++); \ - F = *(ofs_x++); \ - \ - p = *row_parts_in; \ - q = *(row_parts_in + 1); \ - \ - accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - } \ - *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} \ - \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum [2] = { 0 }; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++) * 2; \ - F = *(ofs_x++); \ - \ - p = row_parts_in [0]; \ - q = row_parts_in [2]; \ - \ - accum [0] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - \ - p = row_parts_in [1]; \ - q = row_parts_in [3]; \ - \ - accum [1] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - } \ - *(row_parts_out++) = ((accum [0]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ - *(row_parts_out++) = ((accum [1]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} - -static void -interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - do - { - row_parts_in += *(ofs_x++); - F = *(ofs_x++); - - p = *row_parts_in; - q = *(row_parts_in + 1); - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (row_parts_out != row_parts_out_max); -} - -static void -interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - do - { - row_parts_in += *(ofs_x++) * 2; - F = *(ofs_x++); - - p = row_parts_in [0]; - q = row_parts_in [2]; - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - - p = 
row_parts_in [1]; - q = row_parts_in [3]; - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (row_parts_out != row_parts_out_max); -} - -DEF_INTERP_HORIZONTAL_BILINEAR(1) -DEF_INTERP_HORIZONTAL_BILINEAR(2) -DEF_INTERP_HORIZONTAL_BILINEAR(3) -DEF_INTERP_HORIZONTAL_BILINEAR(4) -DEF_INTERP_HORIZONTAL_BILINEAR(5) -DEF_INTERP_HORIZONTAL_BILINEAR(6) - -static void -interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out - 1; - uint64_t accum = 0; - uint64_t p, q, r, s; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; - p = weight_pixel_64bpp (*(pp++), 256); - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) - { - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); - - r = *(pp++); - s = r * F; - - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - - accum += p + q; - - /* (255 * r) - (F * r) */ - p = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; - - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); - accum = 0; - } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - q = 0; - F = *(ofs_x); - if (F > 0) - q = weight_pixel_64bpp (*(pp), F); - - accum += p + q; - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); -} - -static void -interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + (scale_ctx->width_out - /* 2 */ 1) * 2; - uint64_t accum [2] = { 0, 0 }; - uint64_t p [2], q [2], r [2], s [2]; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; - - p [0] = *(pp++); - p [1] = *(pp++); - weight_pixel_128bpp (p, p, 256); - - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) - { - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); - - r [0] = *(pp++); - r [1] = *(pp++); - - s [0] = r [0] * F; - s [1] = r [1] * F; - - q [0] = (s [0] >> 8) & 0x00ffffff00ffffff; - q [1] = (s [1] >> 8) & 0x00ffffff00ffffff; - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - p [0] = (((r [0] << 8) - r [0] - s [0]) >> 8) & 0x00ffffff00ffffff; - p [1] = (((r [1] << 8) - r [1] - s [1]) >> 8) & 0x00ffffff00ffffff; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); - - accum [0] = 0; - accum [1] = 0; - } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - q [0] = 0; - q [1] = 0; - - F = *(ofs_x); - if (F > 0) - { - q [0] = *(pp++); - q [1] = *(pp++); - weight_pixel_128bpp (q, q, F); - } - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); -} - -static void -interp_horizontal_one_64bpp (const SmolScaleCtx 
*scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; - uint64_t part; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - part = *row_parts_in; - while (row_parts_out != row_parts_out_max) - *(row_parts_out++) = part; -} - -static void -interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - while (row_parts_out != row_parts_out_max) - { - *(row_parts_out++) = row_parts_in [0]; - *(row_parts_out++) = row_parts_in [1]; - } -} - -static void -interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * sizeof (uint64_t)); -} - -static void -interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * 2 * sizeof (uint64_t)); -} - -static void -scale_horizontal (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - const uint32_t *row_in, - uint64_t *row_parts_out) -{ - uint64_t * SMOL_RESTRICT unpacked_in; - - unpacked_in = vertical_ctx->parts_row [3]; - - /* 32-bit unpackers need 32-bit alignment */ - if ((((uintptr_t) row_in) & 3) - && scale_ctx->pixel_type_in != SMOL_PIXEL_RGB8 - && scale_ctx->pixel_type_in != SMOL_PIXEL_BGR8) - { - if (!vertical_ctx->in_aligned) - vertical_ctx->in_aligned = - smol_alloc_aligned (scale_ctx->width_in * sizeof (uint32_t), - &vertical_ctx->in_aligned_storage); - memcpy (vertical_ctx->in_aligned, row_in, scale_ctx->width_in * sizeof (uint32_t)); - row_in = vertical_ctx->in_aligned; - } - - scale_ctx->unpack_row_func (row_in, - unpacked_in, - scale_ctx->width_in); - scale_ctx->hfilter_func (scale_ctx, - unpacked_in, - row_parts_out); -} - -/* --- Vertical scaling --- */ - -static void -update_vertical_ctx_bilinear (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index) -{ - uint32_t new_in_ofs = scale_ctx->offsets_y [outrow_index * 2]; - - if (new_in_ofs == vertical_ctx->in_ofs) - return; - - if (new_in_ofs == vertical_ctx->in_ofs + 1) - { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; - - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); - } - else - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs), - vertical_ctx->parts_row [0]); - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); - } - - vertical_ctx->in_ofs = new_in_ofs; -} - -static void -interp_vertical_bilinear_store_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - 
const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t width) -{ - uint64_t *parts_out_last = parts_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (parts_out != parts_out_last); -} - -static void -interp_vertical_bilinear_add_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, - uint32_t width) -{ - uint64_t *accum_out_last = accum_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (accum_out != accum_out_last); -} - -static void -interp_vertical_bilinear_store_128bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t width) -{ - uint64_t *parts_out_last = parts_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (parts_out != parts_out_last); -} - -static void -interp_vertical_bilinear_add_128bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, - uint32_t width) -{ - uint64_t *accum_out_last = accum_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (accum_out != accum_out_last); -} - -#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ - uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} \ - \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const 
uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ - uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} - -#define DEF_SCALE_OUTROW_BILINEAR(n_halvings) \ -static void \ -scale_outrow_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} \ - \ -static void \ -scale_outrow_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} - -static void -scale_outrow_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t 
outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} - -static void -scale_outrow_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} - -DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) - -static void -scale_outrow_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + return FALSE; } -static void -scale_outrow_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +/* The generic unpack/pack functions fetch and store pixels as u32. + * This means the byte order will be reversed on little endian, with + * consequences for the alpha channel and reordering logic. We deal + * with this by using the apparent byte order internally. 
*/ +static SmolPixelType +get_host_pixel_type (SmolPixelType pixel_type) { - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + if (host_is_little_endian ()) + return pixel_type_u32_le [pixel_type]; + + return pixel_type; } -DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) -DEF_SCALE_OUTROW_BILINEAR(2) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) -DEF_SCALE_OUTROW_BILINEAR(3) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) -DEF_SCALE_OUTROW_BILINEAR(4) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) -DEF_SCALE_OUTROW_BILINEAR(5) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) -DEF_SCALE_OUTROW_BILINEAR(6) +/* ---------------------- * + * Context initialization * + * ---------------------- */ static void -finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, - uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t n) +pick_filter_params (uint32_t src_dim, + uint32_t src_dim_spx, + int32_t dest_ofs_spx, + uint32_t dest_dim, + uint32_t dest_dim_spx, + uint32_t *dest_halvings, + uint32_t *dest_dim_prehalving, + uint32_t *dest_dim_prehalving_spx, + SmolFilterType *dest_filter, + SmolStorageType *dest_storage, + uint16_t *first_opacity, + uint16_t *last_opacity, + SmolFlags flags) { - uint64_t *parts_out_max = parts_out + n; + *dest_dim_prehalving = dest_dim; + *dest_storage = (flags & SMOL_DISABLE_SRGB_LINEARIZATION) ? SMOL_STORAGE_64BPP : SMOL_STORAGE_128BPP; - SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); + *first_opacity = SMOL_SUBPIXEL_MOD (-dest_ofs_spx - 1) + 1; + *last_opacity = SMOL_SUBPIXEL_MOD (dest_ofs_spx + dest_dim_spx - 1) + 1; - while (parts_out != parts_out_max) - { - *(parts_out++) = scale_64bpp (*(accums++), multiplier); - } -} - -static void -weight_edge_row_64bpp (uint64_t *row, - uint16_t w, - uint32_t n) -{ - uint64_t *row_max = row + n; + /* Special handling when the output is a single pixel */ - SMOL_ASSUME_ALIGNED (row, uint64_t *); - - while (row != row_max) + if (dest_dim == 1) { - *row = ((*row * w) >> 8) & 0x00ff00ff00ff00ffULL; - row++; + *first_opacity = dest_dim_spx; + *last_opacity = 256; } -} -static void -scale_and_weight_edge_rows_box_64bpp (const uint64_t * SMOL_RESTRICT first_row, - uint64_t * SMOL_RESTRICT last_row, - uint64_t * SMOL_RESTRICT accum, - uint16_t w2, - uint32_t n) -{ - const uint64_t *first_row_max = first_row + n; - - SMOL_ASSUME_ALIGNED (first_row, const uint64_t *); - SMOL_ASSUME_ALIGNED (last_row, uint64_t *); - SMOL_ASSUME_ALIGNED (accum, uint64_t *); + /* The box algorithms are only sufficiently precise when + * src_dim > dest_dim * 5. box_64bpp typically starts outperforming + * bilinear+halving at src_dim > dest_dim * 8. 
*/ - while (first_row != first_row_max) + if (src_dim > dest_dim * 255) { - uint64_t r, s, p, q; - - p = *(first_row++); - - r = *(last_row); - s = r * w2; - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - /* (255 * r) - (F * r) */ - *(last_row++) = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; - - *(accum++) = p + q; + *dest_storage = SMOL_STORAGE_128BPP; + *dest_filter = SMOL_FILTER_BOX; } -} - -static void -update_vertical_ctx_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t ofs_y, - uint32_t ofs_y_max, - uint16_t w1, - uint16_t w2) -{ - /* Old in_ofs is the previous max */ - if (ofs_y == vertical_ctx->in_ofs) + else if (src_dim > dest_dim * 8) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + *dest_filter = SMOL_FILTER_BOX; } - else + else if (src_dim <= 1) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_edge_row_64bpp (vertical_ctx->parts_row [0], w1, scale_ctx->width_out); + *dest_filter = SMOL_FILTER_ONE; + *last_opacity = ((dest_ofs_spx + dest_dim_spx - 1) % SMOL_SUBPIXEL_MUL) + 1; } - - /* When w2 == 0, the final inrow may be out of bounds. Don't try to access it in - * that case. */ - if (w2 || ofs_y_max < scale_ctx->height_in) + else if ((dest_ofs_spx & 0xff) == 0 && src_dim_spx == dest_dim_spx) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y_max), - vertical_ctx->parts_row [1]); + *dest_filter = SMOL_FILTER_COPY; + *first_opacity = 256; + *last_opacity = 256; } else { - memset (vertical_ctx->parts_row [1], 0, scale_ctx->width_out * sizeof (uint64_t)); + uint32_t n_halvings = 0; + uint32_t d = dest_dim_spx; + + for (;;) + { + d *= 2; + if (d >= src_dim_spx) + break; + n_halvings++; + } + + *dest_dim_prehalving = dest_dim << n_halvings; + *dest_dim_prehalving_spx = dest_dim_spx << n_halvings; + *dest_filter = SMOL_FILTER_BILINEAR_0H + n_halvings; + *dest_halvings = n_halvings; } - vertical_ctx->in_ofs = ofs_y_max; } -static void -scale_outrow_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +static const SmolRepackMeta * +find_repack_match (const SmolRepackMeta *meta, uint16_t sig, uint16_t mask) { - uint32_t ofs_y, ofs_y_max; - uint16_t w1, w2; - - /* Get the inrow range for this outrow: [ofs_y .. ofs_y_max> */ + sig &= mask; - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first and last rows, weight them and store in accumulator */ - - w1 = (outrow_index == 0) ? 
256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1]; - w2 = scale_ctx->offsets_y [outrow_index * 2 + 1]; - - update_vertical_ctx_box_64bpp (scale_ctx, vertical_ctx, ofs_y, ofs_y_max, w1, w2); - - scale_and_weight_edge_rows_box_64bpp (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - w2, - scale_ctx->width_out); - - ofs_y++; - - /* Add up whole rows */ - - while (ofs_y < ofs_y_max) + for (;; meta++) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - add_parts (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - - ofs_y++; + if (!meta->repack_row_func) + { + meta = NULL; + break; + } + + if (sig == (meta->signature & mask)) + break; } - finalize_vertical_64bpp (vertical_ctx->parts_row [2], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [0], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return meta; } static void -finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, - uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t n) +do_reorder (const uint8_t *order_in, uint8_t *order_out, const uint8_t *reorder) { - uint64_t *parts_out_max = parts_out + n * 2; + int i; - SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - while (parts_out != parts_out_max) + for (i = 0; i < 4; i++) { - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - } -} - -static void -weight_row_128bpp (uint64_t *row, - uint16_t w, - uint32_t n) -{ - uint64_t *row_max = row + (n * 2); + uint8_t r = reorder [i]; + uint8_t o; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + if (r == 0) + { + o = 0; + } + else + { + o = order_in [r - 1]; + if (o == 0) + o = i + 1; + } - while (row != row_max) - { - row [0] = ((row [0] * w) >> 8) & 0x00ffffff00ffffffULL; - row [1] = ((row [1] * w) >> 8) & 0x00ffffff00ffffffULL; - row += 2; + order_out [i] = o; } } static void -scale_outrow_box_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +find_repacks (const SmolImplementation **implementations, + SmolStorageType src_storage, SmolStorageType mid_storage, SmolStorageType dest_storage, + SmolAlphaType src_alpha, SmolAlphaType mid_alpha, SmolAlphaType dest_alpha, + SmolGammaType src_gamma, SmolGammaType mid_gamma, SmolGammaType dest_gamma, + const SmolPixelTypeMeta *src_pmeta, const SmolPixelTypeMeta *dest_pmeta, + const SmolRepackMeta **src_repack, const SmolRepackMeta **dest_repack) { - uint32_t ofs_y, ofs_y_max; - uint16_t w; + int src_impl, dest_impl; + const SmolRepackMeta *src_meta, *dest_meta = NULL; + uint16_t src_to_mid_sig, mid_to_dest_sig; + uint16_t sig_mask; + int reorder_dest_alpha_ch; - /* Get the inrow range for this outrow: [ofs_y .. 
ofs_y_max> */ + sig_mask = SMOL_REPACK_SIGNATURE_ANY_ORDER_MASK (1, 1, 1, 1, 1, 1); + src_to_mid_sig = SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER (src_storage, src_alpha, src_gamma, + mid_storage, mid_alpha, mid_gamma); + mid_to_dest_sig = SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER (mid_storage, mid_alpha, mid_gamma, + dest_storage, dest_alpha, dest_gamma); - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; + /* The initial conversion must always leave alpha in position #4, so further + * processing knows where to find it. The order of the other channels + * doesn't matter, as long as there's a repack chain that ultimately + * produces the desired result. */ + reorder_dest_alpha_ch = src_pmeta->order [0] == 4 ? 1 : 4; - /* Scale the first inrow and store it */ + for (src_impl = 0; implementations [src_impl]; src_impl++) + { + src_meta = &implementations [src_impl]->repack_meta [0]; - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_row_128bpp (vertical_ctx->parts_row [0], - outrow_index == 0 ? 256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1], - scale_ctx->width_out); - ofs_y++; + for (;; src_meta++) + { + uint8_t mid_order [4]; - /* Add up whole rows */ + src_meta = find_repack_match (src_meta, src_to_mid_sig, sig_mask); + if (!src_meta) + break; - while (ofs_y < ofs_y_max) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); - - ofs_y++; - } + if (reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (src_meta->signature)].dest [3] != reorder_dest_alpha_ch) + continue; - /* Final row is optional; if this is the bottommost outrow it could be out of bounds */ + do_reorder (src_pmeta->order, mid_order, + reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (src_meta->signature)].dest); - w = scale_ctx->offsets_y [outrow_index * 2 + 1]; - if (w > 0) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - weight_row_128bpp (vertical_ctx->parts_row [1], - w - 1, /* Subtract 1 to avoid overflow */ - scale_ctx->width_out); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); - } + for (dest_impl = 0; implementations [dest_impl]; dest_impl++) + { + dest_meta = &implementations [dest_impl]->repack_meta [0]; - finalize_vertical_128bpp (vertical_ctx->parts_row [0], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [1], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [1], row_out, scale_ctx->width_out); -} + for (;; dest_meta++) + { + uint8_t dest_order [4]; -static void -scale_outrow_one_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) -{ - SMOL_UNUSED (row_index); + dest_meta = find_repack_match (dest_meta, mid_to_dest_sig, sig_mask); + if (!dest_meta) + break; - /* Scale the row and store it */ + do_reorder (mid_order, dest_order, + reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (dest_meta->signature)].dest); - if (vertical_ctx->in_ofs != 0) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + if (*((uint32_t *) dest_order) == *((uint32_t *) dest_pmeta->order)) + { + /* Success */ + goto out; + 
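
The search succeeds when composing the source pixel order with the two reorder steps reproduces the destination's channel order exactly. A small, hypothetical walk-through of how do_reorder () composes orders (the concrete values are made up for clarity and are not taken from the patch):

/* Source is RGBA, i.e. order { 1, 2, 3, 4 }.  A repack whose reorder is
 * { 3, 2, 1, 4 } yields the mid order { 3, 2, 1, 4 } (BGRA, alpha still in
 * slot 4 as required); applying the same reorder again restores
 * { 1, 2, 3, 4 }, which would match an RGBA destination. */
const uint8_t swap_13 [4] = { 3, 2, 1, 4 };
uint8_t src_order [4] = { 1, 2, 3, 4 };
uint8_t mid_order [4];
uint8_t dest_order [4];

do_reorder (src_order, mid_order, swap_13);   /* mid_order  = { 3, 2, 1, 4 } */
do_reorder (mid_order, dest_order, swap_13);  /* dest_order = { 1, 2, 3, 4 } */
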
} + } + } + } } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); +out: + if (src_repack) + *src_repack = src_meta; + if (dest_repack) + *dest_repack = dest_meta; } static void -scale_outrow_one_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +populate_clear_batch (SmolScaleCtx *scale_ctx) { - SMOL_UNUSED (row_index); + uint8_t dest_color [16]; + int pixel_stride; + int i; - /* Scale the row and store it */ + scale_ctx->pack_row_func (scale_ctx->color_pixel, dest_color, 1); + pixel_stride = pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride; - if (vertical_ctx->in_ofs != 0) + for (i = 0; i != SMOL_CLEAR_BATCH_SIZE; i += pixel_stride) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; - } - - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); -} - -static void -scale_outrow_copy (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) -{ - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, row_index), - vertical_ctx->parts_row [0]); + /* Must be an exact fit */ + SMOL_ASSERT (i + pixel_stride <= SMOL_CLEAR_BATCH_SIZE); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + memcpy (scale_ctx->color_pixels_clear_batch + i, dest_color, pixel_stride); + } } -static void -scale_outrow (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - scale_ctx->vfilter_func (scale_ctx, - vertical_ctx, - outrow_index, - row_out); - - if (scale_ctx->post_row_func) - scale_ctx->post_row_func (row_out, scale_ctx->width_out, scale_ctx->user_data); -} +#define IMPLEMENTATION_MAX 8 +/* scale_ctx->storage_type must be initialized first by pick_filter_params() */ static void -do_rows (const SmolScaleCtx *scale_ctx, - void *outrows_dest, - uint32_t row_out_index, - uint32_t n_rows) +get_implementations (SmolScaleCtx *scale_ctx, const void *color_pixel, SmolPixelType color_pixel_type) { - SmolVerticalCtx vertical_ctx = { 0 }; - uint32_t n_parts_per_pixel = 1; - uint32_t n_stored_rows = 4; - uint32_t i; + SmolPixelType src_ptype, dest_ptype; + const SmolPixelTypeMeta *src_pmeta, *dest_pmeta; + const SmolRepackMeta *src_rmeta, *dest_rmeta; + SmolAlphaType internal_alpha = SMOL_ALPHA_PREMUL8; + const SmolImplementation *implementations [IMPLEMENTATION_MAX]; + int i = 0; - if (scale_ctx->storage_type == SMOL_STORAGE_128BPP) - n_parts_per_pixel = 2; + if (color_pixel) + scale_ctx->have_composite_color = TRUE; - /* Must be one less, or this test in update_vertical_ctx() will wrap around: - * if (new_in_ofs == vertical_ctx->in_ofs + 1) { ... 
} */ - vertical_ctx.in_ofs = UINT_MAX - 1; + /* Check for noop (direct copy) */ - for (i = 0; i < n_stored_rows; i++) + if (scale_ctx->hdim.src_size_spx == scale_ctx->hdim.dest_size_spx + && scale_ctx->vdim.src_size_spx == scale_ctx->vdim.dest_size_spx + && scale_ctx->src_pixel_type == scale_ctx->dest_pixel_type + && scale_ctx->composite_op != SMOL_COMPOSITE_SRC_OVER_DEST) { - vertical_ctx.parts_row [i] = - smol_alloc_aligned (MAX (scale_ctx->width_in, scale_ctx->width_out) - * n_parts_per_pixel * sizeof (uint64_t), - &vertical_ctx.row_storage [i]); + /* The scaling and packing is a no-op, but we may still need to + * clear dest, so allow the rest of the function to run so we get + * the clear functions etc. */ + scale_ctx->is_noop = TRUE; } - for (i = row_out_index; i < row_out_index + n_rows; i++) - { - scale_outrow (scale_ctx, &vertical_ctx, i, outrows_dest); - outrows_dest = (uint8_t *) outrows_dest + scale_ctx->rowstride_out; - } + /* Enumerate implementations, preferred first */ - for (i = 0; i < n_stored_rows; i++) + if (!(scale_ctx->flags & SMOL_DISABLE_ACCELERATION)) { - smol_free (vertical_ctx.row_storage [i]); +#ifdef SMOL_WITH_AVX2 + if (have_avx2 ()) + implementations [i++] = _smol_get_avx2_implementation (); +#endif } - /* Used to align row data if needed. May be allocated in scale_horizontal(). */ - if (vertical_ctx.in_aligned) - smol_free (vertical_ctx.in_aligned_storage); -} + implementations [i++] = _smol_get_generic_implementation (); + implementations [i] = NULL; -/* --- Conversion tables --- */ + /* Install repackers */ -static const SmolConversionTable generic_conversions = -{ -{ { - /* Conversions where accumulators must hold the sum of fewer than - * 256 pixels. This can be done in 64bpp, but 128bpp may be used - * e.g. for 16 bits per channel internally premultiplied data. 
*/ + src_ptype = get_host_pixel_type (scale_ctx->src_pixel_type); + dest_ptype = get_host_pixel_type (scale_ctx->dest_pixel_type); - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, 
i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 
1324, p, 4123, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - } - }, + src_pmeta = &pixel_type_meta [src_ptype]; + dest_pmeta = &pixel_type_meta [dest_ptype]; + if (src_pmeta->alpha == SMOL_ALPHA_UNASSOCIATED + && dest_pmeta->alpha == SMOL_ALPHA_UNASSOCIATED) { - /* Conversions where accumulators must hold the sum of up to - * 65535 pixels. We need 128bpp for this. */ + /* In order to preserve the color range in transparent pixels when going + * from unassociated to unassociated, we use 16 bits per channel internally. */ + internal_alpha = SMOL_ALPHA_PREMUL16; + scale_ctx->storage_type = SMOL_STORAGE_128BPP; + } - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - }, - /* ABGR8 pre -> */ + if (scale_ctx->hdim.src_size_px > scale_ctx->hdim.dest_size_px * 8191 + || scale_ctx->vdim.src_size_px > scale_ctx->vdim.dest_size_px * 8191) { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* ABGR8 
un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 un */ SMOL_CONV 
(123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), + /* Even with 128bpp, there's only enough bits to store 11-bit linearized + * times 13 bits of summed pixels plus 8 bits of scratch space for + * multiplying with an 8-bit weight -> 32 bits total per channel. + * + * For now, just turn off sRGB linearization if the input is bigger + * than the output by a factor of 2^13 or more. */ + scale_ctx->gamma_type = SMOL_GAMMA_SRGB_COMPRESSED; } -} } -}; -static const SmolImplementation generic_implementation = -{ - { - /* Horizontal filters */ - { - /* 64bpp */ - interp_horizontal_copy_64bpp, - interp_horizontal_one_64bpp, - interp_horizontal_bilinear_0h_64bpp, - interp_horizontal_bilinear_1h_64bpp, - interp_horizontal_bilinear_2h_64bpp, - interp_horizontal_bilinear_3h_64bpp, - interp_horizontal_bilinear_4h_64bpp, - interp_horizontal_bilinear_5h_64bpp, - interp_horizontal_bilinear_6h_64bpp, - interp_horizontal_boxes_64bpp - }, - { - /* 128bpp */ - interp_horizontal_copy_128bpp, - interp_horizontal_one_128bpp, - interp_horizontal_bilinear_0h_128bpp, - interp_horizontal_bilinear_1h_128bpp, - interp_horizontal_bilinear_2h_128bpp, - interp_horizontal_bilinear_3h_128bpp, - interp_horizontal_bilinear_4h_128bpp, - interp_horizontal_bilinear_5h_128bpp, - interp_horizontal_bilinear_6h_128bpp, - interp_horizontal_boxes_128bpp - } - }, - { - /* Vertical filters */ - { - /* 64bpp */ - scale_outrow_copy, - scale_outrow_one_64bpp, - scale_outrow_bilinear_0h_64bpp, - scale_outrow_bilinear_1h_64bpp, - scale_outrow_bilinear_2h_64bpp, - scale_outrow_bilinear_3h_64bpp, - scale_outrow_bilinear_4h_64bpp, - scale_outrow_bilinear_5h_64bpp, - scale_outrow_bilinear_6h_64bpp, - scale_outrow_box_64bpp - }, - { - /* 128bpp */ - scale_outrow_copy, - scale_outrow_one_128bpp, - scale_outrow_bilinear_0h_128bpp, - scale_outrow_bilinear_1h_128bpp, - scale_outrow_bilinear_2h_128bpp, - scale_outrow_bilinear_3h_128bpp, - scale_outrow_bilinear_4h_128bpp, - scale_outrow_bilinear_5h_128bpp, - scale_outrow_bilinear_6h_128bpp, - scale_outrow_box_128bpp - } - }, - &generic_conversions -}; + find_repacks (implementations, + src_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage, + src_pmeta->alpha, internal_alpha, dest_pmeta->alpha, + SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED, + src_pmeta, dest_pmeta, + &src_rmeta, &dest_rmeta); -/* In the absence of a proper build system, runtime detection is more - portable than compiler macros. WFM. 
-static SmolBool
-host_is_little_endian (void)
-{
-    static const union
-    {
-        uint8_t u8 [4];
-        uint32_t u32;
-    }
-    host_bytes = { { 0, 1, 2, 3 } };
+    SMOL_ASSERT (src_rmeta != NULL);
+    SMOL_ASSERT (dest_rmeta != NULL);
-    if (host_bytes.u32 == 0x03020100UL)
-        return TRUE;
+    scale_ctx->src_unpack_row_func = src_rmeta->repack_row_func;
+    scale_ctx->pack_row_func = dest_rmeta->repack_row_func;
-    return FALSE;
-}
+    if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_OVER_DEST)
+    {
+        const SmolRepackMeta *dest_unpack_rmeta;
-/* The generic unpack/pack functions fetch and store pixels as u32.
- * This means the byte order will be reversed on little endian, with
- * consequences for the alpha channel and reordering logic. We deal
- * with this by using the apparent byte order internally. */
-static SmolPixelType
-get_host_pixel_type (SmolPixelType pixel_type)
-{
-    SmolPixelType host_pixel_type = SMOL_PIXEL_MAX;
+        /* Need to unpack destination rows and composite on them */
-    if (!host_is_little_endian ())
-        return pixel_type;
+        find_repacks (implementations,
+                      dest_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage,
+                      dest_pmeta->alpha, internal_alpha, dest_pmeta->alpha,
+                      SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED,
+                      dest_pmeta, dest_pmeta,
+                      &dest_unpack_rmeta, NULL);
-    /* We use a switch for this so the compiler can remind us
-     * to keep it in sync with the SmolPixelType enum. */
-    switch (pixel_type)
-    {
-        case SMOL_PIXEL_RGBA8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_ABGR8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_BGRA8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_ARGB8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_ARGB8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_BGRA8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_ABGR8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_RGBA8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_RGBA8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_ABGR8_UNASSOCIATED; break;
-        case SMOL_PIXEL_BGRA8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_ARGB8_UNASSOCIATED; break;
-        case SMOL_PIXEL_ARGB8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_BGRA8_UNASSOCIATED; break;
-        case SMOL_PIXEL_ABGR8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_RGBA8_UNASSOCIATED; break;
-        case SMOL_PIXEL_RGB8:
-            host_pixel_type = SMOL_PIXEL_RGB8; break;
-        case SMOL_PIXEL_BGR8:
-            host_pixel_type = SMOL_PIXEL_BGR8; break;
-        case SMOL_PIXEL_MAX:
-            host_pixel_type = SMOL_PIXEL_MAX; break;
+        SMOL_ASSERT (dest_unpack_rmeta != NULL);
+
+        scale_ctx->dest_unpack_row_func = dest_unpack_rmeta->repack_row_func;
     }
+    else
+    {
+        /* Compositing on solid color */
-    return host_pixel_type;
-}
+        if (color_pixel)
+        {
+            SmolPixelType color_ptype;
+            const SmolPixelTypeMeta *color_pmeta;
+            const SmolRepackMeta *color_rmeta;
-#ifdef SMOL_WITH_AVX2
+            color_ptype = get_host_pixel_type (color_pixel_type);
+            color_pmeta = &pixel_type_meta [color_ptype];
-static SmolBool
-have_avx2 (void)
-{
-#ifdef HAVE_GCC_X86_FEATURE_BUILTINS
-    __builtin_cpu_init ();
+            find_repacks (implementations,
+                          color_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage,
+                          color_pmeta->alpha, internal_alpha, dest_pmeta->alpha,
+                          SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED,
+                          color_pmeta, dest_pmeta,
+                          &color_rmeta, NULL);
-    if (__builtin_cpu_supports ("avx2"))
-        return TRUE;
-#endif
+            SMOL_ASSERT (color_rmeta != NULL);
-    return FALSE;
-}
+            color_rmeta->repack_row_func (color_pixel, scale_ctx->color_pixel, 1);
+        }
+        else
+        {
+            /* No color provided; use fully transparent black */
+            memset (scale_ctx->color_pixel, 0, sizeof (scale_ctx->color_pixel));
+        }
-#endif
+        populate_clear_batch (scale_ctx);
+    }
-static void
-try_override_conversion (SmolScaleCtx *scale_ctx,
-                         const SmolImplementation *impl,
-                         SmolPixelType ptype_in,
-                         SmolPixelType ptype_out,
-                         uint8_t *n_bytes_per_pixel)
-{
-    const SmolConversion *conv;
+    /* Install filters and compositors */
-    conv = &impl->ctab->conversions
-        [scale_ctx->storage_type] [ptype_in] [ptype_out];
+    scale_ctx->hfilter_func = NULL;
+    scale_ctx->vfilter_func = NULL;
+    scale_ctx->composite_over_color_func = NULL;
+    scale_ctx->composite_over_dest_func = NULL;
+    scale_ctx->clear_dest_func = NULL;
-    if (conv->unpack_row_func && conv->pack_row_func)
+    for (i = 0; implementations [i]; i++)
     {
-        *n_bytes_per_pixel = conv->n_bytes_per_pixel;
-        scale_ctx->unpack_row_func = conv->unpack_row_func;
-        scale_ctx->pack_row_func = conv->pack_row_func;
+        SmolHFilterFunc *hfilter_func =
+            implementations [i]->hfilter_funcs [scale_ctx->storage_type] [scale_ctx->hdim.filter_type];
+        SmolVFilterFunc *vfilter_func =
+            implementations [i]->vfilter_funcs [scale_ctx->storage_type] [scale_ctx->vdim.filter_type];
+        SmolCompositeOverColorFunc *composite_over_color_func =
+            implementations [i]->composite_over_color_funcs [scale_ctx->storage_type];
+        SmolCompositeOverDestFunc *composite_over_dest_func =
+            implementations [i]->composite_over_dest_funcs [scale_ctx->storage_type];
+        SmolClearFunc *clear_dest_func =
+            implementations [i]->clear_funcs [dest_pmeta->storage];
+
+        if (!scale_ctx->hfilter_func && hfilter_func)
+        {
+            scale_ctx->hfilter_func = hfilter_func;
+            if (implementations [i]->init_h_func)
+                implementations [i]->init_h_func (scale_ctx);
+        }
+
+        if (!scale_ctx->vfilter_func && vfilter_func)
+        {
+            scale_ctx->vfilter_func = vfilter_func;
+            if (implementations [i]->init_v_func)
+                implementations [i]->init_v_func (scale_ctx);
+        }
+
+        if (!scale_ctx->composite_over_color_func && composite_over_color_func)
+            scale_ctx->composite_over_color_func = composite_over_color_func;
+        if (!scale_ctx->composite_over_dest_func && composite_over_dest_func)
+            scale_ctx->composite_over_dest_func = composite_over_dest_func;
+        if (!scale_ctx->clear_dest_func && clear_dest_func)
+            scale_ctx->clear_dest_func = clear_dest_func;
     }
-}
-static void
-try_override_filters (SmolScaleCtx *scale_ctx,
-                      const SmolImplementation *impl)
-{
-    SmolHFilterFunc *hfilter_func;
-    SmolVFilterFunc *vfilter_func;
-
-    hfilter_func = impl->hfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_h];
-    vfilter_func = impl->vfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_v];
-
-    if (hfilter_func)
-        scale_ctx->hfilter_func = hfilter_func;
-    if (vfilter_func)
-        scale_ctx->vfilter_func = vfilter_func;
+    SMOL_ASSERT (scale_ctx->hfilter_func != NULL);
+    SMOL_ASSERT (scale_ctx->vfilter_func != NULL);
 }
 
 static void
-get_implementations (SmolScaleCtx *scale_ctx)
+init_dim (SmolDim *dim,
+          uint32_t src_size_spx,
+          uint32_t dest_size_spx,
+          int32_t placement_ofs_spx,
+          int32_t placement_size_spx,
+          SmolFlags flags,
+          SmolStorageType *storage_type_out)
 {
-    const SmolConversion *conv;
-    SmolPixelType ptype_in, ptype_out;
-    uint8_t n_bytes_per_pixel;
-    const SmolImplementation *avx2_impl = NULL;
-
-#ifdef SMOL_WITH_AVX2
-    if (have_avx2 ())
-        avx2_impl = _smol_get_avx2_implementation ();
-#endif
-
-    ptype_in = get_host_pixel_type (scale_ctx->pixel_type_in);
-    ptype_out = get_host_pixel_type (scale_ctx->pixel_type_out);
-
-    /* Install generic unpack()/pack() */
-
-    conv = &generic_implementation.ctab->conversions
-        [scale_ctx->storage_type] [ptype_in] [ptype_out];
-
-    n_bytes_per_pixel = conv->n_bytes_per_pixel;
-    scale_ctx->unpack_row_func = conv->unpack_row_func;
-    scale_ctx->pack_row_func = conv->pack_row_func;
+    dim->src_size_spx = src_size_spx;
+    dim->src_size_px = SMOL_SPX_TO_PX (src_size_spx);
+    dim->dest_size_spx = dest_size_spx;
+    dim->dest_size_px = SMOL_SPX_TO_PX (dest_size_spx);
+    dim->placement_ofs_spx = placement_ofs_spx;
+    if (placement_ofs_spx < 0)
+        dim->placement_ofs_px = (placement_ofs_spx - 255) / SMOL_SUBPIXEL_MUL;
+    else
+        dim->placement_ofs_px = placement_ofs_spx / SMOL_SUBPIXEL_MUL;
+    dim->placement_size_spx = placement_size_spx;
+    dim->placement_size_px = SMOL_SPX_TO_PX (placement_size_spx + SMOL_SUBPIXEL_MOD (placement_ofs_spx));
-    /* Try to override with better unpack()/pack() implementations */
+    pick_filter_params (dim->src_size_px,
+                        dim->src_size_spx,
+                        dim->placement_ofs_spx,
+                        dim->placement_size_px,
+                        dim->placement_size_spx,
+                        &dim->n_halvings,
+                        &dim->placement_size_prehalving_px,
+                        &dim->placement_size_prehalving_spx,
+                        &dim->filter_type,
+                        storage_type_out,
+                        &dim->first_opacity,
+                        &dim->last_opacity,
+                        flags);
-    if (avx2_impl)
-        try_override_conversion (scale_ctx, avx2_impl,
-                                 ptype_in, ptype_out,
-                                 &n_bytes_per_pixel);
+    /* Calculate clip and clear intervals */
-    /* Some conversions require extra precision. This can only ever
-     * upgrade the storage from 64bpp to 128bpp, but we handle both
-     * cases here for clarity. */
-    if (n_bytes_per_pixel == 8)
-        scale_ctx->storage_type = SMOL_STORAGE_64BPP;
-    else if (n_bytes_per_pixel == 16)
-        scale_ctx->storage_type = SMOL_STORAGE_128BPP;
-    else
+    if (dim->placement_ofs_px > 0)
+    {
+        dim->clear_before_px = dim->placement_ofs_px;
+        dim->clip_before_px = 0;
+    }
+    else if (dim->placement_ofs_px < 0)
     {
-        assert (n_bytes_per_pixel == 8 || n_bytes_per_pixel == 16);
+        dim->clear_before_px = 0;
+        dim->clip_before_px = -dim->placement_ofs_px;
+        dim->first_opacity = 256;
     }
-    /* Install generic filters */
+    if (dim->placement_ofs_px + dim->placement_size_px < dim->dest_size_px)
+    {
+        dim->clear_after_px = dim->dest_size_px - dim->placement_ofs_px - dim->placement_size_px;
+        dim->clip_after_px = 0;
+    }
+    else if (dim->placement_ofs_px + dim->placement_size_px > dim->dest_size_px)
+    {
+        dim->clear_after_px = 0;
+        dim->clip_after_px = dim->placement_ofs_px + dim->placement_size_px - dim->dest_size_px;
+        dim->last_opacity = 256;
+    }
-    scale_ctx->hfilter_func = generic_implementation.hfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_h];
-    scale_ctx->vfilter_func = generic_implementation.vfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_v];
+    /* Clamp placement */
-    /* Try to override with better filter implementations */
+    if (dim->placement_ofs_px < 0)
+    {
+        dim->placement_size_px += dim->placement_ofs_px;
+        dim->placement_ofs_px = 0;
+    }
-    if (avx2_impl)
-        try_override_filters (scale_ctx, avx2_impl);
+    if (dim->placement_ofs_px + dim->placement_size_px > dim->dest_size_px)
+    {
+        dim->placement_size_px = dim->dest_size_px - dim->placement_ofs_px;
+    }
 }
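As a worked example of the clip/clear/clamp logic above (the field names come from the patch, the numbers are hypothetical): with dest_size_px = 100, placement_ofs_px = -20 and placement_size_px = 150, init_dim() ends up with

    dim->clip_before_px  = 20;   /* negative offset: the left edge is clipped, nothing to clear */
    dim->clear_before_px = 0;
    dim->clip_after_px   = 30;   /* -20 + 150 - 100 overhangs the right edge */
    dim->clear_after_px  = 0;
    dim->first_opacity = dim->last_opacity = 256;   /* clipped edges stay fully opaque */

and after the clamping step, placement_ofs_px = 0 and placement_size_px = 100, i.e. only the part of the placement that actually lands on the destination is kept.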
 
 static void
 smol_scale_init (SmolScaleCtx *scale_ctx,
-                 SmolPixelType pixel_type_in,
-                 const void *pixels_in,
-                 uint32_t width_in,
-                 uint32_t height_in,
-                 uint32_t rowstride_in,
-                 SmolPixelType pixel_type_out,
-                 void *pixels_out,
-                 uint32_t width_out,
-                 uint32_t height_out,
-                 uint32_t rowstride_out,
+                 const void *src_pixels,
+                 SmolPixelType src_pixel_type,
+                 uint32_t src_width_spx,
+                 uint32_t src_height_spx,
+                 uint32_t src_rowstride,
+                 const void *color_pixel,
+                 SmolPixelType color_pixel_type,
+                 void *dest_pixels,
+                 SmolPixelType dest_pixel_type,
+                 uint32_t dest_width_spx,
+                 uint32_t dest_height_spx,
+                 uint32_t dest_rowstride,
+                 int32_t placement_x_spx,
+                 int32_t placement_y_spx,
+                 int32_t placement_width_spx,
+                 int32_t placement_height_spx,
+                 SmolCompositeOp composite_op,
+                 SmolFlags flags,
                  SmolPostRowFunc post_row_func,
                  void *user_data)
 {
     SmolStorageType storage_type [2];
 
-    scale_ctx->pixel_type_in = pixel_type_in;
-    scale_ctx->pixels_in = pixels_in;
-    scale_ctx->width_in = width_in;
-    scale_ctx->height_in = height_in;
-    scale_ctx->rowstride_in = rowstride_in;
-    scale_ctx->pixel_type_out = pixel_type_out;
-    scale_ctx->pixels_out = pixels_out;
-    scale_ctx->width_out = width_out;
-    scale_ctx->height_out = height_out;
-    scale_ctx->rowstride_out = rowstride_out;
+    if (placement_width_spx <= 0 || placement_height_spx <= 0)
+    {
+        placement_width_spx = 0;
+        placement_height_spx = 0;
+        placement_x_spx = 0;
+        placement_y_spx = 0;
+    }
+
+    scale_ctx->src_pixels = src_pixels;
+    scale_ctx->src_pixel_type = src_pixel_type;
+    scale_ctx->src_rowstride = src_rowstride;
+
+    scale_ctx->dest_pixels = dest_pixels;
+    scale_ctx->dest_pixel_type = dest_pixel_type;
+    scale_ctx->dest_rowstride = dest_rowstride;
+
+    scale_ctx->composite_op = composite_op;
+    scale_ctx->flags = flags;
+    scale_ctx->gamma_type = (flags & SMOL_DISABLE_SRGB_LINEARIZATION)
+        ? SMOL_GAMMA_SRGB_COMPRESSED : SMOL_GAMMA_SRGB_LINEAR;
 
     scale_ctx->post_row_func = post_row_func;
     scale_ctx->user_data = user_data;
 
-    pick_filter_params (width_in, width_out,
-                        &scale_ctx->width_halvings,
-                        &scale_ctx->width_bilin_out,
-                        &scale_ctx->filter_h,
-                        &storage_type [0]);
-    pick_filter_params (height_in, height_out,
-                        &scale_ctx->height_halvings,
-                        &scale_ctx->height_bilin_out,
-                        &scale_ctx->filter_v,
-                        &storage_type [1]);
+    init_dim (&scale_ctx->hdim,
+              src_width_spx, dest_width_spx,
+              placement_x_spx, placement_width_spx,
+              flags, &storage_type [0]);
+    init_dim (&scale_ctx->vdim,
+              src_height_spx, dest_height_spx,
+              placement_y_spx, placement_height_spx,
+              flags, &storage_type [1]);
 
     scale_ctx->storage_type = MAX (storage_type [0], storage_type [1]);
 
-    scale_ctx->offsets_x = malloc (((scale_ctx->width_bilin_out + 1) * 2
-                                    + (scale_ctx->height_bilin_out + 1) * 2) * sizeof (uint16_t));
-    scale_ctx->offsets_y = scale_ctx->offsets_x + (scale_ctx->width_bilin_out + 1) * 2;
-
-    if (scale_ctx->filter_h == SMOL_FILTER_ONE)
-    {
-    }
-    else if (scale_ctx->filter_h == SMOL_FILTER_BOX)
-    {
-        precalc_boxes_array (scale_ctx->offsets_x, &scale_ctx->span_mul_x,
-                             width_in, scale_ctx->width_out, FALSE);
-    }
-    else /* SMOL_FILTER_BILINEAR_?H */
-    {
-        precalc_bilinear_array (scale_ctx->offsets_x,
-                                width_in, scale_ctx->width_bilin_out, FALSE);
-    }
-
-    if (scale_ctx->filter_v == SMOL_FILTER_ONE)
-    {
-    }
-    else if (scale_ctx->filter_v == SMOL_FILTER_BOX)
-    {
-        precalc_boxes_array (scale_ctx->offsets_y, &scale_ctx->span_mul_y,
-                             height_in, scale_ctx->height_out, TRUE);
-    }
-    else /* SMOL_FILTER_BILINEAR_?H */
-    {
-        precalc_bilinear_array (scale_ctx->offsets_y,
-                                height_in, scale_ctx->height_bilin_out, TRUE);
-    }
+    scale_ctx->hdim.precalc = smol_alloc_aligned (((scale_ctx->hdim.placement_size_prehalving_px + 1) * 2
+                                                   + (scale_ctx->vdim.placement_size_prehalving_px + 1) * 2)
+                                                  * sizeof (uint16_t),
+                                                  &scale_ctx->precalc_storage);
+    scale_ctx->vdim.precalc = ((uint16_t *) scale_ctx->hdim.precalc) + (scale_ctx->hdim.placement_size_prehalving_px + 1) * 2;
 
-    get_implementations (scale_ctx);
+    get_implementations (scale_ctx, color_pixel, color_pixel_type);
 }
 
 static void
 smol_scale_finalize (SmolScaleCtx *scale_ctx)
 {
-    free (scale_ctx->offsets_x);
+    free (scale_ctx->precalc_storage);
 }
 
-/* --- Public API --- */
+/* ---------- *
+ * Public API *
+ * ---------- */
 
 SmolScaleCtx *
-smol_scale_new (SmolPixelType pixel_type_in,
-                const void *pixels_in,
-                uint32_t width_in,
-                uint32_t height_in,
-                uint32_t rowstride_in,
-                SmolPixelType pixel_type_out,
-                void *pixels_out,
-                uint32_t width_out,
-                uint32_t height_out,
-                uint32_t rowstride_out)
+smol_scale_new_simple (const void *src_pixels,
+                       SmolPixelType src_pixel_type,
+                       uint32_t src_width,
+                       uint32_t src_height,
+                       uint32_t src_rowstride,
+                       void *dest_pixels,
+                       SmolPixelType dest_pixel_type,
+                       uint32_t dest_width,
+                       uint32_t dest_height,
+                       uint32_t dest_rowstride,
+                       SmolFlags flags)
 {
     SmolScaleCtx *scale_ctx;
 
     scale_ctx = calloc (sizeof (SmolScaleCtx), 1);
     smol_scale_init (scale_ctx,
-                     pixel_type_in,
-                     pixels_in,
-                     width_in,
-                     height_in,
-                     rowstride_in,
-                     pixel_type_out,
-                     pixels_out,
-                     width_out,
-                     height_out,
-                     rowstride_out,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     NULL,
+                     0,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     0,
+                     0,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     SMOL_COMPOSITE_SRC,
+                     flags,
                      NULL, NULL);
     return scale_ctx;
 }
 
+void
+smol_scale_simple (const void *src_pixels,
+                   SmolPixelType src_pixel_type,
+                   uint32_t src_width,
+                   uint32_t src_height,
+                   uint32_t src_rowstride,
+                   void *dest_pixels,
+                   SmolPixelType dest_pixel_type,
+                   uint32_t dest_width,
+                   uint32_t dest_height,
+                   uint32_t dest_rowstride,
+                   SmolFlags flags)
+{
+    SmolScaleCtx scale_ctx = { 0 };
+    int first_row, n_rows;
+
+    smol_scale_init (&scale_ctx,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     NULL,
+                     0,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     0,
+                     0,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     SMOL_COMPOSITE_SRC,
+                     flags,
+                     NULL, NULL);
+
+    first_row = 0;
+    n_rows = scale_ctx.vdim.dest_size_px;
+
+    if (check_row_range (&scale_ctx, &first_row, &n_rows))
+    {
+        do_rows (&scale_ctx,
+                 dest_row_ofs_to_pointer (&scale_ctx, 0),
+                 first_row,
+                 n_rows);
+    }
+
+    smol_scale_finalize (&scale_ctx);
+}
+
 SmolScaleCtx *
-smol_scale_new_full (SmolPixelType pixel_type_in,
-                     const void *pixels_in,
-                     uint32_t width_in,
-                     uint32_t height_in,
-                     uint32_t rowstride_in,
-                     SmolPixelType pixel_type_out,
-                     void *pixels_out,
-                     uint32_t width_out,
-                     uint32_t height_out,
-                     uint32_t rowstride_out,
+smol_scale_new_full (const void *src_pixels,
+                     SmolPixelType src_pixel_type,
+                     uint32_t src_width,
+                     uint32_t src_height,
+                     uint32_t src_rowstride,
+                     const void *color_pixel,
+                     SmolPixelType color_pixel_type,
+                     void *dest_pixels,
+                     SmolPixelType dest_pixel_type,
+                     uint32_t dest_width,
+                     uint32_t dest_height,
+                     uint32_t dest_rowstride,
+                     int32_t placement_x,
+                     int32_t placement_y,
+                     uint32_t placement_width,
+                     uint32_t placement_height,
+                     SmolCompositeOp composite_op,
+                     SmolFlags flags,
                      SmolPostRowFunc post_row_func,
                      void *user_data)
 {
@@ -3032,16 +1346,24 @@ smol_scale_new_full (SmolPixelType pixel_type_in,
     scale_ctx = calloc (sizeof (SmolScaleCtx), 1);
 
     smol_scale_init (scale_ctx,
-                     pixel_type_in,
-                     pixels_in,
-                     width_in,
-                     height_in,
-                     rowstride_in,
-                     pixel_type_out,
-                     pixels_out,
-                     width_out,
-                     height_out,
-                     rowstride_out,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     color_pixel,
+                     color_pixel_type,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     placement_x,
+                     placement_y,
+                     placement_width,
+                     placement_height,
+                     composite_op,
+                     flags,
                      post_row_func, user_data);
 
     return scale_ctx;
@@ -3054,52 +1376,31 @@ smol_scale_destroy (SmolScaleCtx *scale_ctx)
     free (scale_ctx);
 }
 
-void
-smol_scale_simple (SmolPixelType pixel_type_in,
-                   const void *pixels_in,
-                   uint32_t width_in,
-                   uint32_t height_in,
-                   uint32_t rowstride_in,
-                   SmolPixelType pixel_type_out,
-                   void *pixels_out,
-                   uint32_t width_out,
-                   uint32_t height_out,
-                   uint32_t rowstride_out)
-{
-    SmolScaleCtx scale_ctx;
-
-    smol_scale_init (&scale_ctx,
-                     pixel_type_in, pixels_in,
-                     width_in, height_in, rowstride_in,
-                     pixel_type_out, pixels_out,
-                     width_out, height_out, rowstride_out,
-                     NULL, NULL);
-    do_rows (&scale_ctx,
-             outrow_ofs_to_pointer (&scale_ctx, 0),
-             0,
-             scale_ctx.height_out);
-    smol_scale_finalize (&scale_ctx);
-}
-
 void
 smol_scale_batch (const SmolScaleCtx *scale_ctx,
-                  uint32_t first_out_row,
-                  uint32_t n_out_rows)
+                  int32_t first_dest_row,
+                  int32_t n_dest_rows)
 {
+    if (!check_row_range (scale_ctx, &first_dest_row, &n_dest_rows))
+        return;
+
     do_rows (scale_ctx,
-             outrow_ofs_to_pointer (scale_ctx, first_out_row),
-             first_out_row,
-             n_out_rows);
+             dest_row_ofs_to_pointer (scale_ctx, first_dest_row),
+             first_dest_row,
+             n_dest_rows);
 }
 
 void
 smol_scale_batch_full (const SmolScaleCtx *scale_ctx,
-                       void *outrows_dest,
-                       uint32_t first_out_row,
-                       uint32_t n_out_rows)
+                       void *dest,
+                       int32_t first_dest_row,
+                       int32_t n_dest_rows)
 {
+    if (!check_row_range (scale_ctx, &first_dest_row, &n_dest_rows))
+        return;
+
     do_rows (scale_ctx,
-             outrows_dest,
-             first_out_row,
-             n_out_rows);
+             dest,
+             first_dest_row,
+             n_dest_rows);
 }
diff --git a/chafa/internal/smolscale/smolscale.h b/chafa/internal/smolscale/smolscale.h
index c7cc99e6..2a6fe3a1 100644
--- a/chafa/internal/smolscale/smolscale.h
+++ b/chafa/internal/smolscale/smolscale.h
@@ -11,6 +11,23 @@
 extern "C" {
 #endif
 
+#define SMOL_SUBPIXEL_SHIFT 8
+#define SMOL_SUBPIXEL_MUL (1 << (SMOL_SUBPIXEL_SHIFT))
+
+/* Applies modulo twice, yielding a positive fraction for negative offsets */
+#define SMOL_SUBPIXEL_MOD(n) ((((n) % SMOL_SUBPIXEL_MUL) + SMOL_SUBPIXEL_MUL) % SMOL_SUBPIXEL_MUL)
+
+#define SMOL_PX_TO_SPX(px) ((px) * (SMOL_SUBPIXEL_MUL))
+#define SMOL_SPX_TO_PX(spx) (((spx) + (SMOL_SUBPIXEL_MUL) - 1) / (SMOL_SUBPIXEL_MUL))
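A quick illustration of the new subpixel helpers. This is a hypothetical test program, not part of the patch; it only demonstrates the arithmetic of the macros added above:

    #include <assert.h>
    #include <stdint.h>
    #include "smolscale.h"

    int main (void)
    {
        assert (SMOL_PX_TO_SPX (3) == 768);      /* 3 px = 768 subpixels at 1/256 px granularity */
        assert (SMOL_SPX_TO_PX (769) == 4);      /* converts back, rounding up to whole pixels */
        assert (SMOL_SUBPIXEL_MOD (-1) == 255);  /* negative offsets still yield a positive fraction */
        return 0;
    }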
+
+typedef enum
+{
+    SMOL_NO_FLAGS = 0,
+    SMOL_DISABLE_ACCELERATION = (1 << 0),
+    SMOL_DISABLE_SRGB_LINEARIZATION = (1 << 1)
+}
+SmolFlags;
+
 typedef enum
 {
     /* 32 bits per pixel */
@@ -34,7 +51,15 @@
 }
 SmolPixelType;
 
-typedef void (SmolPostRowFunc) (uint32_t *row_inout,
+typedef enum
+{
+    SMOL_COMPOSITE_SRC,
+    SMOL_COMPOSITE_SRC_CLEAR_DEST,
+    SMOL_COMPOSITE_SRC_OVER_DEST
+}
+SmolCompositeOp;
+
+typedef void (SmolPostRowFunc) (void *row_inout,
                                 int width,
                                 void *user_data);
 
@@ -44,23 +69,52 @@ typedef struct SmolScaleCtx SmolScaleCtx;
 * the source memory and an existing allocation to receive the output data.
 * This interface can only be used from a single thread. */
 
-void smol_scale_simple (SmolPixelType pixel_type_in, const void *pixels_in,
-                        uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                        SmolPixelType pixel_type_out, void *pixels_out,
-                        uint32_t width_out, uint32_t height_out, uint32_t rowstride_out);
+void smol_scale_simple (const void *src_pixels,
+                        SmolPixelType src_pixel_type,
+                        uint32_t src_width,
+                        uint32_t src_height,
+                        uint32_t src_rowstride,
+                        void *dest_pixels,
+                        SmolPixelType dest_pixel_type,
+                        uint32_t dest_width,
+                        uint32_t dest_height,
+                        uint32_t dest_rowstride,
+                        SmolFlags flags);
 
 /* Batch API: Allows scaling a few rows at a time. Suitable for multithreading. */
 
-SmolScaleCtx *smol_scale_new (SmolPixelType pixel_type_in, const void *pixels_in,
-                              uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                              SmolPixelType pixel_type_out, void *pixels_out,
-                              uint32_t width_out, uint32_t height_out, uint32_t rowstride_out);
-
-SmolScaleCtx *smol_scale_new_full (SmolPixelType pixel_type_in, const void *pixels_in,
-                                   uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                                   SmolPixelType pixel_type_out, void *pixels_out,
-                                   uint32_t width_out, uint32_t height_out, uint32_t rowstride_out,
-                                   SmolPostRowFunc post_row_func, void *user_data);
+SmolScaleCtx *smol_scale_new_simple (const void *src_pixels,
+                                     SmolPixelType src_pixel_type,
+                                     uint32_t src_width,
+                                     uint32_t src_height,
+                                     uint32_t src_rowstride,
+                                     void *dest_pixels,
+                                     SmolPixelType dest_pixel_type,
+                                     uint32_t dest_width,
+                                     uint32_t dest_height,
+                                     uint32_t dest_rowstride,
+                                     SmolFlags flags);
+
+SmolScaleCtx *smol_scale_new_full (const void *src_pixels,
+                                   SmolPixelType src_pixel_type,
+                                   uint32_t src_width,
+                                   uint32_t src_height,
+                                   uint32_t src_rowstride,
+                                   const void *color_pixel,
+                                   SmolPixelType color_pixel_type,
+                                   void *dest_pixels,
+                                   SmolPixelType dest_pixel_type,
+                                   uint32_t dest_width,
+                                   uint32_t dest_height,
+                                   uint32_t dest_rowstride,
+                                   int32_t placement_x,
+                                   int32_t placement_y,
+                                   uint32_t placement_width,
+                                   uint32_t placement_height,
+                                   SmolCompositeOp composite_op,
+                                   SmolFlags flags,
+                                   SmolPostRowFunc post_row_func,
+                                   void *user_data);
 
 void smol_scale_destroy (SmolScaleCtx *scale_ctx);
 
@@ -68,7 +122,7 @@ void smol_scale_destroy (SmolScaleCtx *scale_ctx);
 * threads, as long as the outrows do not overlap. Make sure all workers are
 * finished before you call smol_scale_destroy(). */
-void smol_scale_batch (const SmolScaleCtx *scale_ctx, uint32_t first_outrow, uint32_t n_outrows);
+void smol_scale_batch (const SmolScaleCtx *scale_ctx, int32_t first_outrow, int32_t n_outrows);
 
 /* Like smol_scale_batch(), but will write the output rows to outrows_dest
 * instead of relative to pixels_out address handed to smol_scale_new(). The
@@ -76,7 +130,7 @@ void smol_scale_batch (const SmolScaleCtx *scale_ctx, uint32_t first_outrow, uin
 
 void smol_scale_batch_full (const SmolScaleCtx *scale_ctx,
                             void *outrows_dest,
-                            uint32_t first_outrow, uint32_t n_outrows);
+                            int32_t first_outrow, int32_t n_outrows);
 
 #ifdef __cplusplus
 }
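For orientation, here is a hypothetical caller of the reworked batch API. Buffer sizes, the white fill color and the choice of composite op are illustrative only; the argument order and the use of SMOL_PX_TO_SPX for placement follow the declarations above:

    #include <stdint.h>
    #include "smolscale.h"

    /* Scale a 640x480 RGBA source to 160x120 and place it at (80, 60) on a
     * 320x240 canvas, clearing the rest of the canvas. Placement is given in
     * 1/256-pixel units, hence SMOL_PX_TO_SPX. */
    static void
    scale_into_canvas (const void *src, void *dest)
    {
        static const uint8_t white [4] = { 0xff, 0xff, 0xff, 0xff };
        SmolScaleCtx *ctx;

        ctx = smol_scale_new_full (src, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   640, 480, 640 * 4,
                                   white, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   dest, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   320, 240, 320 * 4,
                                   SMOL_PX_TO_SPX (80), SMOL_PX_TO_SPX (60),
                                   SMOL_PX_TO_SPX (160), SMOL_PX_TO_SPX (120),
                                   SMOL_COMPOSITE_SRC_CLEAR_DEST,
                                   SMOL_NO_FLAGS,
                                   NULL, NULL);

        /* Rows could also be split across worker threads; here we do them all at once. */
        smol_scale_batch (ctx, 0, 240);
        smol_scale_destroy (ctx);
    }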