diff --git a/chafa/chafa-symbol-map.c b/chafa/chafa-symbol-map.c index 8222f2e8..1b13222b 100644 --- a/chafa/chafa-symbol-map.c +++ b/chafa/chafa-symbol-map.c @@ -279,11 +279,12 @@ glyph_to_bitmap (gint width, gint height, /* Scale to cell dimensions */ - smol_scale_simple ((SmolPixelType) pixel_format, pixels, width, height, rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + smol_scale_simple (pixels, (SmolPixelType) pixel_format, width, height, rowstride, (gpointer) scaled_pixels, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ CHAFA_SYMBOL_WIDTH_PIXELS, CHAFA_SYMBOL_HEIGHT_PIXELS, - CHAFA_SYMBOL_WIDTH_PIXELS * 4); + CHAFA_SYMBOL_WIDTH_PIXELS * 4, + SMOL_NO_FLAGS); /* Generate coverage map */ @@ -308,11 +309,12 @@ glyph_to_bitmap_wide (gint width, gint height, /* Scale to cell dimensions */ - smol_scale_simple ((SmolPixelType) pixel_format, pixels, width, height, rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + smol_scale_simple (pixels, (SmolPixelType) pixel_format, width, height, rowstride, (gpointer) scaled_pixels, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ CHAFA_SYMBOL_WIDTH_PIXELS * 2, CHAFA_SYMBOL_HEIGHT_PIXELS, - CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); + CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2, + SMOL_NO_FLAGS); /* Generate coverage map */ @@ -1872,10 +1874,11 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map, gpointer temp_pixels = g_malloc (width * CHAFA_SYMBOL_HEIGHT_PIXELS * 4); /* Convert to desired pixel format */ - smol_scale_simple (SMOL_PIXEL_ARGB8_PREMULTIPLIED, *pixels_out, + smol_scale_simple (*pixels_out, SMOL_PIXEL_ARGB8_UNASSOCIATED, /* FIXME: Premul */ width, height, rowstride, - (SmolPixelType) pixel_format, temp_pixels, - width, height, rowstride); + temp_pixels, (SmolPixelType) pixel_format, + width, height, rowstride, + SMOL_NO_FLAGS); g_free (*pixels_out); *pixels_out = temp_pixels; } diff --git a/chafa/internal/chafa-indexed-image.c b/chafa/internal/chafa-indexed-image.c index b3fd5e46..88023aa4 100644 --- a/chafa/internal/chafa-indexed-image.c +++ b/chafa/internal/chafa-indexed-image.c @@ -427,6 +427,7 @@ chafa_indexed_image_draw_pixels (ChafaIndexedImage *indexed_image, gint dest_width, gint dest_height) { DrawPixelsCtx ctx; + ChafaColor bg; g_return_if_fail (dest_width == indexed_image->width); g_return_if_fail (dest_height <= indexed_image->height); @@ -444,23 +445,54 @@ chafa_indexed_image_draw_pixels (ChafaIndexedImage *indexed_image, ctx.dest_width = dest_width; ctx.dest_height = dest_height; +#if 0 + /* FIXME: Need a new smolscale compositing mode that preserves src + * alpha before this can be implemented */ + bg = *chafa_palette_get_color (&indexed_image->palette, + CHAFA_COLOR_SPACE_RGB, + CHAFA_PALETTE_INDEX_BG); + bg.ch [3] = 0xff; +#else gen_color_lut_rgba8 (ctx.bg_color_lut, *chafa_palette_get_color (&indexed_image->palette, CHAFA_COLOR_SPACE_RGB, CHAFA_PALETTE_INDEX_BG)); +#endif ctx.scaled_data = g_new (guint32, dest_width * dest_height); - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ +#if 0 + bg.ch, +#else + NULL, +#endif + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_PREMULTIPLIED, dest_width, dest_height, dest_width * sizeof (guint32), + /* Placement */ + 0, + 0, + dest_width * SMOL_SUBPIXEL_MUL, + dest_height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC_CLEAR_DEST, 
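[Reviewer note, illustrative only, not part of the patch] The smol_scale_simple() call sites above all follow the same migration pattern: the source pointer moves to the front, each surface is described as (pixels, pixel type, width, height, rowstride), and a flags argument is appended; the smol_scale_new_full() sites additionally gain fill, placement and compositing arguments. A minimal sketch of the new smol_scale_simple() argument order, paraphrased from the hunks above — the buffer names, sizes and exact parameter types here are illustrative, not authoritative:

    #include <stdint.h>
    #include <stdlib.h>
    #include "smolscale.h"

    static void
    scale_rgba_sketch (const uint32_t *src, uint32_t src_w, uint32_t src_h)
    {
        uint32_t dest_w = 64, dest_h = 32;
        uint32_t *dest = malloc (dest_w * dest_h * sizeof (uint32_t));

        /* Source surface first, then destination surface, then flags */
        smol_scale_simple (src, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                           src_w, src_h, src_w * sizeof (uint32_t),
                           dest, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                           dest_w, dest_h, dest_w * sizeof (uint32_t),
                           SMOL_NO_FLAGS);
        free (dest);
    }
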
+#if 0 + SMOL_NO_FLAGS, + NULL, +#else + SMOL_DISABLE_SRGB_LINEARIZATION, post_scale_row, +#endif &ctx); draw_pixels (&ctx); diff --git a/chafa/internal/chafa-iterm2-canvas.c b/chafa/internal/chafa-iterm2-canvas.c index e305ad60..31cb7399 100644 --- a/chafa/internal/chafa-iterm2-canvas.c +++ b/chafa/internal/chafa-iterm2-canvas.c @@ -76,10 +76,17 @@ typedef enum } TiffTagId; +typedef enum +{ + TIFF_EXTRA_SAMPLE_UNSPECIFIED = 0, + TIFF_EXTRA_SAMPLE_ASSOC_ALPHA = 1, + TIFF_EXTRA_SAMPLE_UNASSOC_ALPHA = 2 +} +TiffExtraSampleType; + #define TIFF_PHOTOMETRIC_INTERPRETATION_RGB 2 #define TIFF_ORIENTATION_TOPLEFT 1 #define TIFF_PLANAR_CONFIGURATION_CONTIGUOUS 1 -#define TIFF_EXTRA_SAMPLE_ASSOC_ALPHA 1 typedef struct { @@ -150,16 +157,29 @@ chafa_iterm2_canvas_draw_all_pixels (ChafaIterm2Canvas *iterm2_canvas, ChafaPixe return; ctx.iterm2_canvas = iterm2_canvas; - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ + NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul? */ iterm2_canvas->width, iterm2_canvas->height, iterm2_canvas->width * sizeof (guint32), + /* Placement */ + 0, + 0, + iterm2_canvas->width * SMOL_SUBPIXEL_MUL, + iterm2_canvas->height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC, + SMOL_DISABLE_SRGB_LINEARIZATION, NULL, &ctx); @@ -246,7 +266,7 @@ chafa_iterm2_canvas_build_ansi (ChafaIterm2Canvas *iterm2_canvas, ChafaTermInfo generate_tag (&base64, out_str, TIFF_TAG_STRIP_BYTE_COUNTS, TIFF_TYPE_LONG, 1, iterm2_canvas->width * iterm2_canvas->height * 4); generate_tag (&base64, out_str, TIFF_TAG_PLANAR_CONFIGURATION, TIFF_TYPE_SHORT, 1, TIFF_PLANAR_CONFIGURATION_CONTIGUOUS); - generate_tag (&base64, out_str, TIFF_TAG_EXTRA_SAMPLES, TIFF_TYPE_SHORT, 1, TIFF_EXTRA_SAMPLE_ASSOC_ALPHA); + generate_tag (&base64, out_str, TIFF_TAG_EXTRA_SAMPLES, TIFF_TYPE_SHORT, 1, TIFF_EXTRA_SAMPLE_UNASSOC_ALPHA); /* Next IFD offset (terminator) */ diff --git a/chafa/internal/chafa-kitty-canvas.c b/chafa/internal/chafa-kitty-canvas.c index 411f7eaa..bdd70062 100644 --- a/chafa/internal/chafa-kitty-canvas.c +++ b/chafa/internal/chafa-kitty-canvas.c @@ -41,8 +41,6 @@ typedef struct { ChafaKittyCanvas *kitty_canvas; SmolScaleCtx *scale_ctx; - ChafaColor bg_color; - gboolean flatten_alpha; } DrawCtx; @@ -130,14 +128,6 @@ draw_pixels_worker (ChafaBatchInfo *batch, const DrawCtx *ctx) ((guint32 *) ctx->kitty_canvas->rgba_image) + (ctx->kitty_canvas->width * batch->first_row), batch->first_row, batch->n_rows); - - /* FIXME: Smolscale should be able to do this */ - if (ctx->flatten_alpha) - chafa_composite_rgba_on_solid_color (ctx->bg_color, - ctx->kitty_canvas->rgba_image, - ctx->kitty_canvas->width, - batch->first_row, - batch->n_rows); } void @@ -146,7 +136,9 @@ chafa_kitty_canvas_draw_all_pixels (ChafaKittyCanvas *kitty_canvas, ChafaPixelTy gint src_width, gint src_height, gint src_rowstride, ChafaColor bg_color) { + uint8_t bg_color_rgba [4]; DrawCtx ctx; + gboolean flatten_alpha; g_return_if_fail (kitty_canvas != NULL); g_return_if_fail (src_pixel_type < CHAFA_PIXEL_MAX); @@ -157,22 +149,36 @@ chafa_kitty_canvas_draw_all_pixels (ChafaKittyCanvas *kitty_canvas, ChafaPixelTy if (src_width == 0 || src_height == 0) return; + flatten_alpha = bg_color.ch [3] == 0; + bg_color.ch [3] = 0xff; + chafa_color8_store_to_rgba8 
(bg_color, bg_color_rgba); + ctx.kitty_canvas = kitty_canvas; - ctx.scale_ctx = smol_scale_new_full ((SmolPixelType) src_pixel_type, + ctx.scale_ctx = smol_scale_new_full (/* Source */ (const guint32 *) src_pixels, + (SmolPixelType) src_pixel_type, src_width, src_height, src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, + /* Fill */ + flatten_alpha ? bg_color_rgba : NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + /* Destination */ NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Opaque? */ kitty_canvas->width, kitty_canvas->height, kitty_canvas->width * sizeof (guint32), + /* Placement */ + 0, + 0, + kitty_canvas->width * SMOL_SUBPIXEL_MUL, + kitty_canvas->height * SMOL_SUBPIXEL_MUL, + /* Extra args */ + SMOL_COMPOSITE_SRC, + SMOL_NO_FLAGS, NULL, &ctx); - ctx.bg_color = bg_color; - ctx.flatten_alpha = bg_color.ch [3] == 0; - chafa_process_batches (&ctx, (GFunc) draw_pixels_worker, NULL, diff --git a/chafa/internal/chafa-pixops.c b/chafa/internal/chafa-pixops.c index 6dfd7ac1..de6658bb 100644 --- a/chafa/internal/chafa-pixops.c +++ b/chafa/internal/chafa-pixops.c @@ -620,33 +620,6 @@ composite_alpha_on_bg (ChafaColor bg_color, } } -/* FIXME: Could we always destroy the alpha channel and eliminate the other - * variant? */ -static void -composite_alpha_on_solid (ChafaColor bg_color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows) -{ - ChafaPixel *p0, *p1; - - p0 = pixels + first_row * width; - p1 = p0 + n_rows * width; - - for ( ; p0 < p1; p0++) - { - p0->col.ch [0] += (bg_color.ch [0] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [1] += (bg_color.ch [1] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [2] += (bg_color.ch [2] * (255 - (guint32) p0->col.ch [3])) / 255; - p0->col.ch [3] = 0xff; - } -} - -void -chafa_composite_rgba_on_solid_color (ChafaColor color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows) -{ - composite_alpha_on_solid (color, pixels, width, first_row, n_rows); -} - static void prepare_pixels_2_worker (ChafaBatchInfo *batch, PrepareContext *prep_ctx) { @@ -798,16 +771,17 @@ chafa_prepare_pixel_data_for_symbols (const ChafaPalette *palette, prep_ctx.dest_width = dest_width; prep_ctx.dest_height = dest_height; - prep_ctx.scale_ctx = smol_scale_new ((SmolPixelType) prep_ctx.src_pixel_type, - (const guint32 *) prep_ctx.src_pixels, - prep_ctx.src_width, - prep_ctx.src_height, - prep_ctx.src_rowstride, - SMOL_PIXEL_RGBA8_PREMULTIPLIED, - NULL, - prep_ctx.dest_width, - prep_ctx.dest_height, - prep_ctx.dest_width * sizeof (guint32)); + prep_ctx.scale_ctx = smol_scale_new_simple (prep_ctx.src_pixels, + (SmolPixelType) prep_ctx.src_pixel_type, + prep_ctx.src_width, + prep_ctx.src_height, + prep_ctx.src_rowstride, + NULL, + SMOL_PIXEL_RGBA8_UNASSOCIATED, /* FIXME: Premul */ + prep_ctx.dest_width, + prep_ctx.dest_height, + prep_ctx.dest_width * sizeof (guint32), + SMOL_NO_FLAGS); prepare_pixels_pass_1 (&prep_ctx); prepare_pixels_pass_2 (&prep_ctx); diff --git a/chafa/internal/chafa-pixops.h b/chafa/internal/chafa-pixops.h index 5adab95e..5980dada 100644 --- a/chafa/internal/chafa-pixops.h +++ b/chafa/internal/chafa-pixops.h @@ -43,9 +43,6 @@ void chafa_sort_pixel_index_by_channel (guint8 *index, const ChafaPixel *pixels, gint n_pixels, gint ch); -void chafa_composite_rgba_on_solid_color (ChafaColor color, - ChafaPixel *pixels, gint width, gint first_row, gint n_rows); - G_END_DECLS #endif /* __CHAFA_PIXOPS_H__ */ diff --git a/chafa/internal/smolscale/Makefile.am b/chafa/internal/smolscale/Makefile.am index ae60118b..47c59f7a 100644 --- 
a/chafa/internal/smolscale/Makefile.am +++ b/chafa/internal/smolscale/Makefile.am @@ -14,6 +14,7 @@ libsmolscale_la_LIBADD = libsmolscale_la_SOURCES = \ smolscale.c \ smolscale.h \ + smolscale-generic.c \ smolscale-private.h if HAVE_AVX2_INTRINSICS diff --git a/chafa/internal/smolscale/smolscale-avx2.c b/chafa/internal/smolscale/smolscale-avx2.c index adc96966..7f179ea5 100644 --- a/chafa/internal/smolscale/smolscale-avx2.c +++ b/chafa/internal/smolscale/smolscale-avx2.c @@ -10,1042 +10,600 @@ #include #include "smolscale-private.h" +/* ---------------------- * + * Context initialization * + * ---------------------- */ -/* --- Linear interpolation helpers --- */ +/* Number of horizontal pixels to process in a single batch. The define exists for + * clarity and cannot be changed without significant changes to the code elsewhere. */ +#define BILIN_HORIZ_BATCH_PIXELS 16 -#define LERP_SIMD256_EPI32(a, b, f) \ - _mm256_add_epi32 ( \ - _mm256_srli_epi32 ( \ - _mm256_mullo_epi32 ( \ - _mm256_sub_epi32 ((a), (b)), factors), 8), (b)) - -#define LERP_SIMD128_EPI32(a, b, f) \ - _mm_add_epi32 ( \ - _mm_srli_epi32 ( \ - _mm_mullo_epi32 ( \ - _mm_sub_epi32 ((a), (b)), factors), 8), (b)) - -#define LERP_SIMD256_EPI32_AND_MASK(a, b, f, mask) \ - _mm256_and_si256 (LERP_SIMD256_EPI32 ((a), (b), (f)), (mask)) - -#define LERP_SIMD128_EPI32_AND_MASK(a, b, f, mask) \ - _mm_and_si128 (LERP_SIMD128_EPI32 ((a), (b), (f)), (mask)) - -/* --- Premultiplication --- */ - -#define INVERTED_DIV_SHIFT 21 -#define INVERTED_DIV_ROUNDING (1U << (INVERTED_DIV_SHIFT - 1)) -#define INVERTED_DIV_ROUNDING_128BPP \ - (((uint64_t) INVERTED_DIV_ROUNDING << 32) | INVERTED_DIV_ROUNDING) - -/* This table is used to divide by an integer [1..255] using only a lookup, - * multiplication and a shift. This is faster than plain division on most - * architectures. +/* Batched precalc array layout: * - * Each entry represents the integer 2097152 (1 << 21) divided by the index - * of the entry. Consequently, + * 16 offsets followed by 16 factors, repeating until epilogue. The epilogue + * has offsets and factors alternating one by one, and will always have fewer + * than 16 o/f pairs: * - * (v / i) ~= (v * inverted_div_table [i] + (1 << 20)) >> 21 + * ooooooooooooooooffffffffffffffffooooooooooooooooffffffffffffffffofofofofof... * - * (1 << 20) is added for nearest rounding. It would've been nice to keep - * this table in uint16_t, but alas, we need the extra bits for sufficient - * precision. 
*/ -static const uint32_t inverted_div_table [256] = -{ - 0,2097152,1048576, 699051, 524288, 419430, 349525, 299593, - 262144, 233017, 209715, 190650, 174763, 161319, 149797, 139810, - 131072, 123362, 116508, 110376, 104858, 99864, 95325, 91181, - 87381, 83886, 80660, 77672, 74898, 72316, 69905, 67650, - 65536, 63550, 61681, 59919, 58254, 56680, 55188, 53773, - 52429, 51150, 49932, 48771, 47663, 46603, 45590, 44620, - 43691, 42799, 41943, 41121, 40330, 39569, 38836, 38130, - 37449, 36792, 36158, 35545, 34953, 34380, 33825, 33288, - 32768, 32264, 31775, 31301, 30840, 30394, 29959, 29537, - 29127, 28728, 28340, 27962, 27594, 27236, 26887, 26546, - 26214, 25891, 25575, 25267, 24966, 24672, 24385, 24105, - 23831, 23564, 23302, 23046, 22795, 22550, 22310, 22075, - 21845, 21620, 21400, 21183, 20972, 20764, 20560, 20361, - 20165, 19973, 19784, 19600, 19418, 19240, 19065, 18893, - 18725, 18559, 18396, 18236, 18079, 17924, 17772, 17623, - 17476, 17332, 17190, 17050, 16913, 16777, 16644, 16513, - 16384, 16257, 16132, 16009, 15888, 15768, 15650, 15534, - 15420, 15308, 15197, 15087, 14980, 14873, 14769, 14665, - 14564, 14463, 14364, 14266, 14170, 14075, 13981, 13888, - 13797, 13707, 13618, 13530, 13443, 13358, 13273, 13190, - 13107, 13026, 12945, 12866, 12788, 12710, 12633, 12558, - 12483, 12409, 12336, 12264, 12193, 12122, 12053, 11984, - 11916, 11848, 11782, 11716, 11651, 11586, 11523, 11460, - 11398, 11336, 11275, 11215, 11155, 11096, 11038, 10980, - 10923, 10866, 10810, 10755, 10700, 10645, 10592, 10538, - 10486, 10434, 10382, 10331, 10280, 10230, 10180, 10131, - 10082, 10034, 9986, 9939, 9892, 9846, 9800, 9754, - 9709, 9664, 9620, 9576, 9533, 9489, 9447, 9404, - 9362, 9321, 9279, 9239, 9198, 9158, 9118, 9079, - 9039, 9001, 8962, 8924, 8886, 8849, 8812, 8775, - 8738, 8702, 8666, 8630, 8595, 8560, 8525, 8490, - 8456, 8422, 8389, 8355, 8322, 8289, 8257, 8224, -}; - -/* Masking and shifting out the results is left to the caller. In - * and out may not overlap. 
*/ -static SMOL_INLINE void -unpremul_i_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = ((in [0] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); - out [1] = ((in [1] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); -} + * 16 offsets layout: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 16 factors layout: 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + */ -static SMOL_INLINE void -unpremul_p_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) +static uint32_t +array_offset_offset (uint32_t elem_i, int max_index, int do_batches) { - out [0] = (((in [0] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); - out [1] = (((in [1] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); + if (do_batches + && (max_index - ((elem_i / BILIN_HORIZ_BATCH_PIXELS) * BILIN_HORIZ_BATCH_PIXELS) + >= BILIN_HORIZ_BATCH_PIXELS)) + { + return (elem_i / (BILIN_HORIZ_BATCH_PIXELS)) * (BILIN_HORIZ_BATCH_PIXELS * 2) + + (elem_i % BILIN_HORIZ_BATCH_PIXELS); + } + else + { + return elem_i * 2; + } } -static SMOL_INLINE uint64_t -unpremul_p_to_u_64bpp (const uint64_t in, - uint8_t alpha) +static uint32_t +array_offset_factor (uint32_t elem_i, int max_index, int do_batches) { - uint64_t in_128bpp [2]; - uint64_t out_128bpp [2]; + const uint8_t o [BILIN_HORIZ_BATCH_PIXELS] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; - in_128bpp [0] = (in & 0x000000ff000000ff); - in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - - unpremul_p_to_u_128bpp (in_128bpp, out_128bpp, alpha); - - return (out_128bpp [0] & 0x000000ff000000ff) - | ((out_128bpp [1] & 0x000000ff000000ff) << 16); + if (do_batches + && (max_index - ((elem_i / BILIN_HORIZ_BATCH_PIXELS) * BILIN_HORIZ_BATCH_PIXELS) + >= BILIN_HORIZ_BATCH_PIXELS)) + { + return (elem_i / (BILIN_HORIZ_BATCH_PIXELS)) * (BILIN_HORIZ_BATCH_PIXELS * 2) + + BILIN_HORIZ_BATCH_PIXELS + o [elem_i % BILIN_HORIZ_BATCH_PIXELS]; + } + else + { + return elem_i * 2 + 1; + } } -static SMOL_INLINE uint64_t -premul_u_to_p_64bpp (const uint64_t in, - uint8_t alpha) +static void +precalc_linear_range (uint16_t *array_out, + int first_index, + int last_index, + int max_index, + uint64_t first_sample_ofs, + uint64_t sample_step, + int sample_ofs_px_max, + int32_t dest_clip_before_px, + int do_batches, + int *array_i_inout) { - return ((in * ((uint16_t) alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; -} - -/* --- Packing --- */ - -/* It's nice to be able to shift by a negative amount */ -#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) - -#if 0 -/* Currently unused */ - -/* This is kind of bulky (~13 x86 insns), but it's about the same as using - * unions, and we don't have to worry about endianness. 
*/ -#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) -#endif - -#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) - -#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 2 : n) - -#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - -#if 0 -/* Currently unused */ - -#define PACK_FROM_1324_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [(SWAP_2_AND_3 (a) - 1) >> 1]), \ - ((SWAP_2_AND_3 (a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (b) - 1) >> 1]), \ - ((SWAP_2_AND_3 (b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (c) - 1) >> 1]), \ - ((SWAP_2_AND_3 (c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (d) - 1) >> 1]), \ - ((SWAP_2_AND_3 (d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -#endif + uint64_t sample_ofs; + int i; -/* Pack p -> p */ + sample_ofs = first_sample_ofs; -static SMOL_INLINE uint32_t -pack_pixel_1324_p_to_1234_p_64bpp (uint64_t in) -{ - return in | (in >> 24); -} + for (i = first_index; i < last_index; i++) + { + uint16_t sample_ofs_px = sample_ofs / SMOL_BILIN_MULTIPLIER; -static void -pack_row_1324_p_to_1234_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; + if (sample_ofs_px >= sample_ofs_px_max - 1) + { + if (i >= dest_clip_before_px) + { + array_out [array_offset_offset ((*array_i_inout), max_index, do_batches)] = sample_ofs_px_max - 2; + array_out [array_offset_factor ((*array_i_inout), max_index, do_batches)] = 0; + (*array_i_inout)++; + } + continue; + } - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + if (i >= dest_clip_before_px) + { + array_out [array_offset_offset ((*array_i_inout), max_index, do_batches)] = sample_ofs_px; + array_out [array_offset_factor ((*array_i_inout), max_index, do_batches)] = SMOL_SMALL_MUL + - ((sample_ofs / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); + (*array_i_inout)++; + } - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); + sample_ofs += sample_step; } } static void -pack_row_132a_p_to_123_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +precalc_bilinear_array (uint16_t *array, + uint64_t src_dim_spx, + uint64_t dest_ofs_spx, + uint64_t dest_dim_spx, + uint32_t dest_dim_prehalving_px, + unsigned int n_halvings, + int32_t dest_clip_before_px, + int32_t dest_clip_after_px, + unsigned int do_batches) { - uint8_t *row_out_max = row_out + n_pixels * 3; + uint32_t src_dim_px = SMOL_SPX_TO_PX (src_dim_spx); + uint64_t first_sample_ofs [3]; + uint64_t sample_step; + int i = 0; + + 
assert (src_dim_px > 1); - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; - while (row_out != row_out_max) + if (src_dim_spx > dest_dim_spx) { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; + /* Minification */ + sample_step = ((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / dest_dim_spx; + first_sample_ofs [0] = (sample_step - SMOL_BILIN_MULTIPLIER) / 2; + first_sample_ofs [1] = ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + + ((sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx) * (1 << n_halvings)) / SMOL_SUBPIXEL_MUL); } + else + { + /* Magnification */ + sample_step = ((src_dim_spx - SMOL_SUBPIXEL_MUL) * SMOL_BILIN_MULTIPLIER) + / (dest_dim_spx > SMOL_SUBPIXEL_MUL ? (dest_dim_spx - SMOL_SUBPIXEL_MUL) : 1); + first_sample_ofs [0] = 0; + first_sample_ofs [1] = (sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL; + } + + first_sample_ofs [2] = (((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / SMOL_SUBPIXEL_MUL) + + ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + - sample_step * (1U << n_halvings); + + /* Left fringe */ + precalc_linear_range (array, + 0, + 1 << n_halvings, + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [0], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); + + /* Main range */ + precalc_linear_range (array, + 1 << n_halvings, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [1], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); + + /* Right fringe */ + precalc_linear_range (array, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px, + dest_dim_prehalving_px - dest_clip_after_px, + first_sample_ofs [2], + sample_step, + src_dim_px, + dest_clip_before_px, + do_batches, + &i); } static void -pack_row_132a_p_to_321_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +precalc_boxes_array (uint32_t *array, + uint32_t *span_step, + uint32_t *span_mul, + uint32_t src_dim_spx, + int32_t dest_dim, + uint32_t dest_ofs_spx, + uint32_t dest_dim_spx, + int32_t dest_clip_before_px) +{ + uint64_t fracF, frac_stepF; + uint64_t f; + uint64_t stride; + uint64_t a, b; + int i, dest_i; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; - while (row_out != row_out_max) - { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} + /* Output sample can't be less than a pixel. Fringe opacity is applied in + * a separate step. FIXME: May cause wrong subpixel distribution -- revisit. 
*/ + if (dest_dim_spx < 256) + dest_dim_spx = 256; -#define DEF_PACK_FROM_1324_P_TO_P_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (uint64_t in) \ -{ \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1324_p_to_##a##b##c##d##_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (*(row_in++)); \ -} + frac_stepF = ((uint64_t) src_dim_spx * SMOL_BIG_MUL) / (uint64_t) dest_dim_spx; + fracF = 0; -DEF_PACK_FROM_1324_P_TO_P_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_1324_P_TO_P_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_1324_P_TO_P_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 3, 2, 1) + stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; + f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; -static SMOL_INLINE uint32_t -pack_pixel_1234_p_to_1234_p_128bpp (const uint64_t *in) -{ - /* FIXME: Are masks needed? */ - return ((in [0] >> 8) & 0xff000000) - | ((in [0] << 16) & 0x00ff0000) - | ((in [1] >> 24) & 0x0000ff00) - | (in [1] & 0x000000ff); -} + /* We divide by (b + 1) instead of just (b) to avoid overflows in + * scale_128bpp_half(), which would affect horizontal box scaling. The + * fudge factor counters limited precision in the inverted division + * operation. It causes 16-bit values to undershoot by less than 127/65535 + * (<.2%). Since the final output is 8-bit, and rounding neutralizes the + * error, this doesn't matter. */ -static void -pack_row_1234_p_to_1234_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; + a = (SMOL_BOXES_MULTIPLIER * 255); + b = ((stride * 255) + ((f * 255) / 256)); + *span_step = frac_stepF / SMOL_SMALL_MUL; + *span_mul = (a + (b / 2)) / (b + 1); + + /* Left fringe */ + i = 0; + dest_i = 0; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + if (dest_i >= dest_clip_before_px) + array [i++] = 0; - while (row_out != row_out_max) + /* Main range */ + fracF = ((frac_stepF * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL); + for (dest_i = 1; dest_i < dest_dim - 1; dest_i++) { - *(row_out++) = pack_pixel_1234_p_to_1234_p_128bpp (row_in); - row_in += 2; + if (dest_i >= dest_clip_before_px) + array [i++] = fracF / SMOL_SMALL_MUL; + fracF += frac_stepF; } -} -#define DEF_PACK_FROM_1234_P_TO_P_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - return PACK_FROM_1234_128BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (row_in); \ - row_in += 2; \ - } \ + /* Right fringe */ + if (dest_dim > 1 && dest_i >= dest_clip_before_px) + array [i++] = (((uint64_t) src_dim_spx * SMOL_SMALL_MUL - frac_stepF) / SMOL_SMALL_MUL); } -DEF_PACK_FROM_1234_P_TO_P_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_1234_P_TO_P_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_1234_P_TO_P_128BPP (3, 2, 1, 4) 
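[Reviewer note, illustrative only, not part of the patch] The batched precalc layout described earlier in this hunk is easier to see with a concrete mapping. For one full batch of 16 output pixels, offsets occupy slots 0..15 in pixel order and factors occupy slots 16..31 in the interleaved order 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15, as the layout comment states. A small standalone program (names illustrative only) that reproduces the slot mapping computed by array_offset_offset() and array_offset_factor() for a full batch:

    #include <stdio.h>

    #define BATCH 16

    int
    main (void)
    {
        /* Same permutation as the o[] table in array_offset_factor() above */
        const int o [BATCH] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
        int elem_i;

        for (elem_i = 0; elem_i < BATCH; elem_i++)
        {
            int offset_slot = elem_i;              /* offsets fill slots 0..15 in order */
            int factor_slot = BATCH + o [elem_i];  /* factors fill slots 16..31, interleaved */

            printf ("pixel %2d -> offset slot %2d, factor slot %2d\n",
                    elem_i, offset_slot, factor_slot);
        }

        return 0;
    }

Reading the factor slots back in order 16..31 yields the pixel sequence 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15; the epilogue (fewer than 16 remaining pixels) falls back to the plain alternating elem_i * 2 / elem_i * 2 + 1 scheme.
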
-DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 3, 2, 1) - static void -pack_row_123a_p_to_123_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_dim (SmolDim *dim, int do_batches) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (dim->filter_type == SMOL_FILTER_ONE || dim->filter_type == SMOL_FILTER_COPY) { - *(row_out++) = *row_in >> 32; - *(row_out++) = *(row_in++); - *(row_out++) = *(row_in++) >> 32; } -} - -static void -pack_row_123a_p_to_321_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (dim->filter_type == SMOL_FILTER_BOX) { - *(row_out++) = row_in [1] >> 32; - *(row_out++) = row_in [0]; - *(row_out++) = row_in [0] >> 32; - row_in += 2; + precalc_boxes_array (dim->precalc, + &dim->span_step, + &dim->span_mul, + dim->src_size_spx, + dim->placement_size_px, + dim->placement_ofs_spx, + dim->placement_size_spx, + dim->clip_before_px); } -} - -/* Pack p (alpha last) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_132a_p_to_1234_u_64bpp (uint64_t in) -{ - uint8_t alpha = in; - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; - return in | (in >> 24); -} - -static void -pack_row_132a_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else /* SMOL_FILTER_BILINEAR_?H */ { - *(row_out++) = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); + precalc_bilinear_array (dim->precalc, + dim->src_size_spx, + dim->placement_ofs_spx, + dim->placement_size_prehalving_spx, + dim->placement_size_prehalving_px, + dim->n_halvings, + dim->clip_before_px, + dim->clip_after_px, + do_batches); } } static void -pack_row_132a_p_to_123_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_horizontal (SmolScaleCtx *scale_ctx) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + init_dim (&scale_ctx->hdim, + scale_ctx->storage_type == SMOL_STORAGE_64BPP ? 
TRUE : FALSE); } static void -pack_row_132a_p_to_321_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_132A_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = in; \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_132a_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_132A_P_TO_U_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_123A_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [1]; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +init_vertical (SmolScaleCtx *scale_ctx) { - uint8_t *row_out_max = row_out + n_pixels * 3; + init_dim (&scale_ctx->vdim, FALSE); +} - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* ----------------- * + * Premultiplication * + * ----------------- */ - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } +static SMOL_INLINE void +premul_u_to_p8_128bpp (uint64_t * SMOL_RESTRICT inout, + uint8_t alpha) +{ + inout [0] = (((inout [0] + 0x0000000100000001) * ((uint16_t) alpha + 1) - 0x0000000100000001) + >> 8) & 0x000000ff000000ff; + inout [1] = (((inout [1] + 0x0000000100000001) * ((uint16_t) alpha + 1) - 0x0000000100000001) + >> 8) & 0x000000ff000000ff; } -static void -pack_row_123a_p_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +unpremul_p8_to_u_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t 
*); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } + out [0] = ((in [0] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; + out [1] = ((in [1] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; } -/* Pack p (alpha first) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_a324_p_to_1234_u_64bpp (uint64_t in) +static SMOL_INLINE uint64_t +premul_u_to_p8_64bpp (const uint64_t in, + uint8_t alpha) { - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); - return in | (in >> 24); + return (((in + 0x0001000100010001) * ((uint16_t) alpha + 1) - 0x0001000100010001) + >> 8) & 0x00ff00ff00ff00ff; } -static void -pack_row_a324_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpremul_p8_to_u_64bpp (const uint64_t in, + uint8_t alpha) { - uint32_t *row_out_max = row_out + n_pixels; + uint64_t in_128bpp [2]; + uint64_t out_128bpp [2]; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + in_128bpp [0] = (in & 0x000000ff000000ff); + in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - } + unpremul_p8_to_u_128bpp (in_128bpp, out_128bpp, alpha); + + return out_128bpp [0] | (out_128bpp [1] << 16); } -static void -pack_row_a324_p_to_234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +premul_u_to_p16_128bpp (uint64_t *inout, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } + inout [0] = inout [0] * alpha; + inout [1] = inout [1] * alpha; } -static void -pack_row_a324_p_to_432_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} - -#define DEF_PACK_FROM_A324_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a324_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_A324_P_TO_U_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_A324_P_TO_U_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_A324_P_TO_U_64BPP (4, 3, 2, 1) - -#define 
DEF_PACK_FROM_A234_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [0] >> 32; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [0] = (t [0] & 0x00000000ffffffff) | ((uint64_t) alpha << 32); \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_A234_P_TO_U_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_A234_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_a234_p_to_234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void +unpremul_p16_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } + out [0] = ((in [0] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; + out [1] = ((in [1] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; } -static void -pack_row_a234_p_to_432_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +/* --------- * + * Repacking * + * --------- */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* PACK_SHUF_MM256_EPI8_32_TO_128() + * + * Generates a shuffling register for packing 8bpc pixel channels in the + * provided order. The order (1, 2, 3, 4) is neutral and corresponds to + * + * _mm256_set_epi8 (13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, + * 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + */ +#define SHUF_ORDER_32_TO_128 0x01000302U +#define SHUF_CH_32_TO_128(n) ((char) (SHUF_ORDER_32_TO_128 >> ((4 - (n)) * 8))) +#define SHUF_QUAD_CH_32_TO_128(q, n) (4 * (q) + SHUF_CH_32_TO_128 (n)) +#define SHUF_QUAD_32_TO_128(q, a, b, c, d) \ + SHUF_QUAD_CH_32_TO_128 ((q), (a)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (b)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (c)), \ + SHUF_QUAD_CH_32_TO_128 ((q), (d)) +#define PACK_SHUF_EPI8_LANE_32_TO_128(a, b, c, d) \ + SHUF_QUAD_32_TO_128 (3, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (2, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (1, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_128 (0, (a), (b), (c), (d)) +#define PACK_SHUF_MM256_EPI8_32_TO_128(a, b, c, d) _mm256_set_epi8 ( \ + PACK_SHUF_EPI8_LANE_32_TO_128 ((a), (b), (c), (d)), \ + PACK_SHUF_EPI8_LANE_32_TO_128 ((a), (b), (c), (d))) + +/* PACK_SHUF_MM256_EPI8_32_TO_64() + * + * 64bpp version. Packs only once, so fewer contortions required. 
*/ +#define SHUF_CH_32_TO_64(n) ((char) (4 - (n))) +#define SHUF_QUAD_CH_32_TO_64(q, n) (4 * (q) + SHUF_CH_32_TO_64 (n)) +#define SHUF_QUAD_32_TO_64(q, a, b, c, d) \ + SHUF_QUAD_CH_32_TO_64 ((q), (a)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (b)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (c)), \ + SHUF_QUAD_CH_32_TO_64 ((q), (d)) +#define PACK_SHUF_EPI8_LANE_32_TO_64(a, b, c, d) \ + SHUF_QUAD_32_TO_64 (3, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (2, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (1, (a), (b), (c), (d)), \ + SHUF_QUAD_32_TO_64 (0, (a), (b), (c), (d)) +#define PACK_SHUF_MM256_EPI8_32_TO_64(a, b, c, d) _mm256_set_epi8 ( \ + PACK_SHUF_EPI8_LANE_32_TO_64 ((a), (b), (c), (d)), \ + PACK_SHUF_EPI8_LANE_32_TO_64 ((a), (b), (c), (d))) - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} +/* It's nice to be able to shift by a negative amount */ +#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) -/* Pack i (alpha last) to u */ +/* This is kind of bulky (~13 x86 insns), but it's about the same as using + * unions, and we don't have to worry about endianness. */ +#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ + ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ + | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -static SMOL_INLINE uint32_t -pack_pixel_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT in) -{ - uint8_t alpha = (in [1] >> 8) & 0xff; - uint64_t t [2]; +#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 
2 : n) - unpremul_i_to_u_128bpp (in, t, alpha); +#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - return ((t [0] >> 8) & 0xff000000) - | ((t [0] << 16) & 0x00ff0000) - | ((t [1] >> 24) & 0x0000ff00) - | alpha; -} +/* ---------------------- * + * Repacking: 24/32 -> 64 * + * ---------------------- */ static void -pack_8x_123a_i_to_xxxx_u_128bpp (const uint64_t * SMOL_RESTRICT *in, - uint32_t * SMOL_RESTRICT *out, - uint32_t * out_max, - const __m256i channel_shuf) +unpack_8x_1234_p8_to_xxxx_p8_64bpp (const uint32_t * SMOL_RESTRICT *in, + uint64_t * SMOL_RESTRICT *out, + uint64_t *out_max, + const __m256i channel_shuf) { -#define ALPHA_MUL (1 << (INVERTED_DIV_SHIFT - 8)) -#define ALPHA_MASK SMOL_8X1BIT (0, 1, 0, 0, 0, 1, 0, 0) - - const __m256i ones = _mm256_set_epi32 ( - ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, - ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL); - const __m256i alpha_clean_mask = _mm256_set_epi32 ( - 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, - 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff); - const __m256i rounding = _mm256_set_epi32 ( - INVERTED_DIV_ROUNDING, 0, INVERTED_DIV_ROUNDING, INVERTED_DIV_ROUNDING, - INVERTED_DIV_ROUNDING, 0, INVERTED_DIV_ROUNDING, INVERTED_DIV_ROUNDING); - __m256i m00, m01, m02, m03, m04, m05, m06, m07, m08; + const __m256i zero = _mm256_setzero_si256 (); const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2; - SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (my_out, __m256i * SMOL_RESTRICT); - while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) + while ((ptrdiff_t) (my_out + 2) <= (ptrdiff_t) out_max) { - /* Load inputs */ - - m00 = _mm256_stream_load_si256 (my_in); - my_in++; - m01 = _mm256_stream_load_si256 (my_in); - my_in++; - m02 = _mm256_stream_load_si256 (my_in); - my_in++; - m03 = _mm256_stream_load_si256 (my_in); + m0 = _mm256_loadu_si256 (my_in); my_in++; - /* Load alpha factors */ - - m04 = _mm256_slli_si256 (m00, 4); - m06 = _mm256_srli_si256 (m03, 4); - m05 = _mm256_blend_epi32 (m04, m01, ALPHA_MASK); - m07 = _mm256_blend_epi32 (m06, m02, ALPHA_MASK); - m07 = _mm256_srli_si256 (m07, 4); - - m04 = _mm256_blend_epi32 (m05, m07, SMOL_8X1BIT (0, 0, 1, 1, 0, 0, 1, 1)); - m04 = _mm256_srli_epi32 (m04, 8); - m04 = _mm256_and_si256 (m04, alpha_clean_mask); - m04 = _mm256_i32gather_epi32 ((const void *) inverted_div_table, m04, 4); - - /* 2 pixels times 4 */ - - m05 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (3, 3, 3, 3)); - m06 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (2, 2, 2, 2)); - m07 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (1, 1, 1, 1)); - m08 = _mm256_shuffle_epi32 (m04, SMOL_4X2BIT (0, 0, 0, 0)); - - m05 = _mm256_blend_epi32 (m05, ones, ALPHA_MASK); - m06 = _mm256_blend_epi32 (m06, ones, ALPHA_MASK); - m07 = _mm256_blend_epi32 (m07, ones, ALPHA_MASK); - m08 = _mm256_blend_epi32 (m08, ones, ALPHA_MASK); - - m05 = _mm256_mullo_epi32 (m05, m00); - m06 = _mm256_mullo_epi32 (m06, m01); - m07 = _mm256_mullo_epi32 (m07, m02); - m08 = _mm256_mullo_epi32 (m08, m03); - - m05 = _mm256_add_epi32 (m05, rounding); - m06 = _mm256_add_epi32 (m06, rounding); - m07 = _mm256_add_epi32 (m07, rounding); - m08 = 
_mm256_add_epi32 (m08, rounding); - - m05 = _mm256_srli_epi32 (m05, INVERTED_DIV_SHIFT); - m06 = _mm256_srli_epi32 (m06, INVERTED_DIV_SHIFT); - m07 = _mm256_srli_epi32 (m07, INVERTED_DIV_SHIFT); - m08 = _mm256_srli_epi32 (m08, INVERTED_DIV_SHIFT); - - /* Pack and store */ - - m00 = _mm256_packus_epi32 (m05, m06); - m01 = _mm256_packus_epi32 (m07, m08); - m00 = _mm256_packus_epi16 (m00, m01); + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); - m00 = _mm256_shuffle_epi8 (m00, channel_shuf); - m00 = _mm256_permute4x64_epi64 (m00, SMOL_4X2BIT (3, 1, 2, 0)); - m00 = _mm256_shuffle_epi32 (m00, SMOL_4X2BIT (3, 1, 2, 0)); + m1 = _mm256_unpacklo_epi8 (m0, zero); + m2 = _mm256_unpackhi_epi8 (m0, zero); - _mm256_storeu_si256 (my_out, m00); - my_out += 1; + _mm256_store_si256 (my_out, m1); + my_out++; + _mm256_store_si256 (my_out, m2); + my_out++; } - *out = (uint32_t * SMOL_RESTRICT) my_out; - *in = (const uint64_t * SMOL_RESTRICT) my_in; - -#undef ALPHA_MUL -#undef ALPHA_MASK + *out = (uint64_t * SMOL_RESTRICT) my_out; + *in = (const uint32_t * SMOL_RESTRICT) my_in; } -/* PACK_SHUF_MM256_EPI8() - * - * Generates a shuffling register for packing 8bpc pixel channels in the - * provided order. The order (1, 2, 3, 4) is neutral and corresponds to - * - * _mm256_set_epi8 (13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, - * 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); - */ -#define SHUF_ORDER 0x01000302U -#define SHUF_CH(n) ((char) (SHUF_ORDER >> ((4 - (n)) * 8))) -#define SHUF_QUAD_CH(q, n) (4 * (q) + SHUF_CH (n)) -#define SHUF_QUAD(q, a, b, c, d) \ - SHUF_QUAD_CH ((q), (a)), \ - SHUF_QUAD_CH ((q), (b)), \ - SHUF_QUAD_CH ((q), (c)), \ - SHUF_QUAD_CH ((q), (d)) -#define PACK_SHUF_EPI8_LANE(a, b, c, d) \ - SHUF_QUAD (3, (a), (b), (c), (d)), \ - SHUF_QUAD (2, (a), (b), (c), (d)), \ - SHUF_QUAD (1, (a), (b), (c), (d)), \ - SHUF_QUAD (0, (a), (b), (c), (d)) -#define PACK_SHUF_MM256_EPI8(a, b, c, d) _mm256_set_epi8 ( \ - PACK_SHUF_EPI8_LANE ((a), (b), (c), (d)), \ - PACK_SHUF_EPI8_LANE ((a), (b), (c), (d))) - -static void -pack_row_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpack_pixel_123_p8_to_132a_p8_64bpp (const uint8_t *p) { - uint32_t *row_out_max = row_out + n_pixels; - const __m256i channel_shuf = PACK_SHUF_MM256_EPI8 (1, 2, 3, 4); - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t * SMOL_RESTRICT); - - pack_8x_123a_i_to_xxxx_u_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - } + return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) + | ((uint64_t) p [2] << 32) | 0xff; } -static void -pack_row_123a_i_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; + *(dest_row++) = unpack_pixel_123_p8_to_132a_p8_64bpp (src_row); + src_row += 3; } -} - -static void -pack_row_123a_i_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * 
SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_123A_I_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint8_t alpha = (in [1] >> 8) & 0xff; \ - uint64_t t [2]; \ - unpremul_i_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - const __m256i channel_shuf = PACK_SHUF_MM256_EPI8 ((a), (b), (c), (d)); \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - pack_8x_123a_i_to_xxxx_u_128bpp (&row_in, &row_out, row_out_max, \ - channel_shuf); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_I_TO_U_128BPP(3, 2, 1, 4) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 1, 2, 3) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 3, 2, 1) - -/* Unpack p -> p */ +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_1234_p_to_1324_p_64bpp (uint32_t p) +unpack_pixel_1234_p8_to_1324_p8_64bpp (uint32_t p) { return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); } -/* AVX2 has a useful instruction for this: __m256i _mm256_cvtepu8_epi16 (__m128i a); - * It results in a different channel ordering, so it'd be important to match with - * the right kind of re-pack. 
*/ -static void -unpack_row_1234_p_to_1324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (1, 3, 2, 4); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_1234_p_to_1324_p_64bpp (*(row_in++)); + *(dest_row++) = unpack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_123_p_to_132a_p_64bpp (const uint8_t *p) +unpack_pixel_1234_p8_to_3241_p8_64bpp (uint32_t p) { - return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) - | ((uint64_t) p [2] << 32) | 0xff; + return (((uint64_t) p & 0x0000ff00) << 40) + | (((uint64_t) p & 0x00ff00ff) << 16) | (p >> 24); } -static void -unpack_row_123_p_to_132a_p_64bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (3, 2, 4, 1); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_123_p_to_132a_p_64bpp (row_in); - row_in += 3; + *(dest_row++) = unpack_pixel_1234_p8_to_3241_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_1234_p_to_1234_p_128bpp (uint32_t p, - uint64_t *out) +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_2431_p8_64bpp (uint32_t p) { uint64_t p64 = p; - out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); - out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); -} -static void -unpack_row_1234_p_to_1234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + return ((p64 & 0x00ff00ff) << 32) | ((p64 & 0x0000ff00) << 8) + | ((p64 & 0xff000000) >> 24); +} - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (2, 4, 3, 1); + unpack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - while (row_out != row_out_max) + while (dest_row != dest_row_max) { - unpack_pixel_1234_p_to_1234_p_128bpp (*(row_in++), row_out); - row_out += 2; + *(dest_row++) = unpack_pixel_1234_p8_to_2431_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_123_p_to_123a_p_128bpp (const uint8_t *in, - uint64_t *out) -{ - out [0] = ((uint64_t) in [0] << 32) | in [1]; - out [1] = ((uint64_t) in [2] << 32) | 0xff; -} - -static void -unpack_row_123_p_to_123a_p_128bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE uint64_t +unpack_pixel_a234_u_to_324a_p8_64bpp (uint32_t p) { - uint64_t *row_out_max = row_out + n_pixels * 2; + uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 40) | 
(((uint64_t) p & 0x00ff00ff) << 16); + uint8_t alpha = p >> 24; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + return premul_u_to_p8_64bpp (p64, alpha) | alpha; +} - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_123_p_to_123a_p_128bpp (row_in, row_out); - row_in += 3; - row_out += 2; + *(dest_row++) = unpack_pixel_a234_u_to_324a_p8_64bpp (*(src_row++)); } -} - -/* Unpack u (alpha first) -> p */ +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE uint64_t -unpack_pixel_a234_u_to_a324_p_64bpp (uint32_t p) +unpack_pixel_1234_u_to_2431_p8_64bpp (uint32_t p) { - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); uint8_t alpha = p >> 24; - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); + return premul_u_to_p8_64bpp (p64, alpha) | alpha; } -static void -unpack_row_a234_u_to_a324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - *(row_out++) = unpack_pixel_a234_u_to_a324_p_64bpp (*(row_in++)); + *(dest_row++) = unpack_pixel_1234_u_to_2431_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -static SMOL_INLINE void -unpack_pixel_a234_u_to_a234_p_128bpp (uint32_t p, - uint64_t *out) +static SMOL_INLINE uint64_t +unpack_pixel_123a_u_to_132a_p8_64bpp (uint32_t p) { - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; + return premul_u_to_p8_64bpp (p64, alpha) | alpha; } -static void -unpack_row_a234_u_to_a234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_a234_u_to_a234_p_128bpp (*(row_in++), row_out); - row_out += 2; + *(dest_row++) = unpack_pixel_123a_u_to_132a_p8_64bpp (*(src_row++)); } -} +} SMOL_REPACK_ROW_DEF_END -/* Unpack u -> i (common) */ +/* ----------------------- * + * Repacking: 24/32 -> 128 * + * ----------------------- */ static void -unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, - uint64_t * SMOL_RESTRICT *out, - uint64_t *out_max, - const __m256i channel_shuf) +unpack_8x_xxxx_u_to_123a_p16_128bpp (const uint32_t * SMOL_RESTRICT *in, + uint64_t * SMOL_RESTRICT *out, + uint64_t *out_max, + const __m256i channel_shuf) { const __m256i zero = _mm256_setzero_si256 (); const __m256i factor_shuf = _mm256_set_epi8 ( @@ -1057,10 +615,10 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, const __m256i alpha_add = _mm256_set_epi16 ( 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0, 0, 0x80, 0, 0); - __m256i m0, m1, m2, m3, m4, m5, m6; 
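[Reviewer note, illustrative only, not part of the patch] The rename of the old *_i_* helpers to *_p16_* in the surrounding hunks reflects, as I read it, the two premultiplied storage precisions used by the new code: premul8 rounds channel * alpha back to 8 bits, while premul16 keeps the full 16-bit product so that unpremultiplying later recovers low-alpha channels with far less rounding error. A minimal scalar illustration of that difference, using plain integer arithmetic rather than the actual _smol_inv_div_p8_lut / _smol_inv_div_p16_lut tables:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
        uint32_t channel = 200, alpha = 3;

        /* premul16-style: keep the full product; dividing by alpha recovers
         * the original channel exactly (for alpha > 0). */
        uint32_t p16 = channel * alpha;                   /* 600 */
        uint32_t back16 = p16 / alpha;                    /* 200 */

        /* premul8-style: round the product back to 8 bits first; the
         * quantization error is magnified when unpremultiplying at low alpha. */
        uint32_t p8 = (channel * alpha + 127) / 255;      /* 2 */
        uint32_t back8 = (p8 * 255 + alpha / 2) / alpha;  /* 170 */

        printf ("p16 roundtrip: %u -> %u; p8 roundtrip: %u -> %u\n",
                channel, back16, channel, back8);
        return 0;
    }
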
- __m256i fact1, fact2; const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2, m3, m4, m5, m6; + __m256i fact1, fact2; SMOL_ASSUME_ALIGNED (my_out, __m256i * SMOL_RESTRICT); @@ -1095,13 +653,13 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, m5 = _mm256_unpacklo_epi16 (m2, zero); m6 = _mm256_unpackhi_epi16 (m2, zero); - _mm256_store_si256 ((__m256i *) my_out, m3); + _mm256_store_si256 (my_out, m3); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m4); + _mm256_store_si256 (my_out, m4); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m5); + _mm256_store_si256 (my_out, m5); my_out++; - _mm256_store_si256 ((__m256i *) my_out, m6); + _mm256_store_si256 (my_out, m6); my_out++; } @@ -1109,11 +667,84 @@ unpack_8x_xxxx_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT *in, *in = (const uint32_t * SMOL_RESTRICT) my_in; } -/* Unpack u (alpha first) -> i */ +static SMOL_INLINE void +unpack_pixel_123_p8_to_123a_p8_128bpp (const uint8_t *in, + uint64_t *out) +{ + out [0] = ((uint64_t) in [0] << 32) | in [1]; + out [1] = ((uint64_t) in [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END static SMOL_INLINE void -unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, - uint64_t *out) +unpack_pixel_123a_p8_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_p8_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32) | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + p64 = premul_u_to_p8_64bpp (p64, alpha) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16_128bpp (uint32_t p, + uint64_t *out) { uint64_t p64 = p; uint64_t alpha = p >> 24; @@ -1122,143 +753,515 @@ unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, out [1] = (((((p64 & 0x000000ff) << 32) * alpha))) | (alpha << 8) | 0x80; } -static void -unpack_row_a234_u_to_234a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * 
SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - const __m256i channel_shuf = _mm256_set_epi8 ( - 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1, - 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 (2, 3, 4, 1); + unpack_8x_xxxx_u_to_123a_p16_128bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); - SMOL_ASSUME_ALIGNED (row_out, uint64_t * SMOL_RESTRICT); + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END - unpack_8x_xxxx_u_to_123a_i_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; + + p64 = premul_u_to_p8_64bpp (p64, alpha) | ((uint64_t) alpha); + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} - while (row_out != row_out_max) +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) { - unpack_pixel_a234_u_to_234a_i_128bpp (*(row_in++), row_out); - row_out += 2; + unpack_pixel_123a_u_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint64_t alpha = p & 0xff; + + out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); + out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; } -/* Unpack u (alpha last) -> p */ +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, COMPRESSED) { + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 (1, 2, 3, 4); + unpack_8x_xxxx_u_to_123a_p16_128bpp (&src_row, &dest_row, dest_row_max, + channel_shuf); -static SMOL_INLINE uint64_t -unpack_pixel_123a_u_to_132a_p_64bpp (uint32_t p) + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +/* ---------------------- * + * Repacking: 64 -> 24/32 * + * ---------------------- */ + +static void +pack_8x_1234_p8_to_xxxx_p8_64bpp (const uint64_t * SMOL_RESTRICT *in, + uint32_t * SMOL_RESTRICT *out, + uint32_t * out_max, + const __m256i channel_shuf) { - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; + const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; + __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1; + + SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); + + while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) + { + /* Load inputs */ + + m0 = _mm256_stream_load_si256 (my_in); + my_in++; + m1 = _mm256_stream_load_si256 (my_in); + my_in++; + + /* Pack and store */ + + m0 = _mm256_packus_epi16 (m0, m1); + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + + _mm256_storeu_si256 (my_out, m0); + my_out++; + } + + *out = (uint32_t * SMOL_RESTRICT) my_out; + *in = (const uint64_t * SMOL_RESTRICT) my_in; +} - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); +static SMOL_INLINE 
uint32_t
+pack_pixel_1234_p8_to_1324_p8_64bpp (uint64_t in)
+{
+    return in | (in >> 24);
 }
 
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     132, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 24;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     132, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 24;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     231, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 24;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     231, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 24;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     324, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     324, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p >> 16;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     423, 24, 8, PREMUL8, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+        *(dest_row++) = p;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     423, 24, 8, UNASSOCIATED, COMPRESSED) {
+    while (dest_row != dest_row_max)
+    {
+        uint8_t alpha = *src_row;
+        uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha;
+        uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t);
+        *(dest_row++) = p;
+        *(dest_row++) = p >> 8;
+        *(dest_row++) = p >> 16;
+        src_row++;
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     1324, 32, 32, PREMUL8, COMPRESSED) {
+    const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 (1, 3, 2, 4);
+    pack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max,
+                                      channel_shuf);
+    while (dest_row != dest_row_max)
+    {
+        *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++));
+    }
+} SMOL_REPACK_ROW_DEF_END
+
+SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED,
+                     1324, 
32, 32, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_64BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_64 ((a), (b), (c), (d)); \ + pack_8x_1234_p8_to_xxxx_p8_64bpp (&src_row, &dest_row, dest_row_max, \ + channel_shuf); \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_64BPP (*src_row, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint8_t alpha = *src_row; \ + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_64BPP (t, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (1, 4, 2, 3) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (2, 3, 1, 4) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 1, 3, 2) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 2, 3, 1) + +/* ----------------------- * + * Repacking: 128 -> 24/32 * + * ----------------------- */ + static void -unpack_row_123a_u_to_132a_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +pack_8x_123a_p16_to_xxxx_u_128bpp (const uint64_t * SMOL_RESTRICT *in, + uint32_t * SMOL_RESTRICT *out, + uint32_t * out_max, + const __m256i channel_shuf) { - uint64_t *row_out_max = row_out + n_pixels; +#define ALPHA_MUL (1 << (INVERTED_DIV_SHIFT_P16 - 8)) +#define ALPHA_MASK SMOL_8X1BIT (0, 1, 0, 0, 0, 1, 0, 0) + + const __m256i ones = _mm256_set_epi32 ( + ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, + ALPHA_MUL, ALPHA_MUL, ALPHA_MUL, ALPHA_MUL); + const __m256i alpha_clean_mask = _mm256_set_epi32 ( + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff); + const __m256i * SMOL_RESTRICT my_in = (const __m256i * SMOL_RESTRICT) *in; + __m256i * SMOL_RESTRICT my_out = (__m256i * SMOL_RESTRICT) *out; + __m256i m0, m1, m2, m3, m4, m5, m6, m7, m8; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + SMOL_ASSUME_ALIGNED (my_in, __m256i * SMOL_RESTRICT); - while (row_out != row_out_max) + while ((ptrdiff_t) (my_out + 1) <= (ptrdiff_t) out_max) { - *(row_out++) = unpack_pixel_123a_u_to_132a_p_64bpp (*(row_in++)); - } -} + /* Load inputs */ -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; + m0 = _mm256_stream_load_si256 (my_in); + my_in++; + m1 = _mm256_stream_load_si256 (my_in); + my_in++; + m2 = _mm256_stream_load_si256 (my_in); + my_in++; + m3 = _mm256_stream_load_si256 (my_in); + my_in++; - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} + /* Load alpha factors */ -static void -unpack_row_123a_u_to_123a_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + m4 = _mm256_slli_si256 (m0, 4); + m6 = _mm256_srli_si256 
(m3, 4); + m5 = _mm256_blend_epi32 (m4, m1, ALPHA_MASK); + m7 = _mm256_blend_epi32 (m6, m2, ALPHA_MASK); + m7 = _mm256_srli_si256 (m7, 4); - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + m4 = _mm256_blend_epi32 (m5, m7, SMOL_8X1BIT (0, 0, 1, 1, 0, 0, 1, 1)); + m4 = _mm256_srli_epi32 (m4, 8); + m4 = _mm256_and_si256 (m4, alpha_clean_mask); + m4 = _mm256_i32gather_epi32 ((const void *) _smol_inv_div_p16_lut, m4, 4); - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} + /* 2 pixels times 4 */ -/* Unpack u (alpha last) -> i */ + m5 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (3, 3, 3, 3)); + m6 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (2, 2, 2, 2)); + m7 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (1, 1, 1, 1)); + m8 = _mm256_shuffle_epi32 (m4, SMOL_4X2BIT (0, 0, 0, 0)); -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p & 0xff; + m5 = _mm256_blend_epi32 (m5, ones, ALPHA_MASK); + m6 = _mm256_blend_epi32 (m6, ones, ALPHA_MASK); + m7 = _mm256_blend_epi32 (m7, ones, ALPHA_MASK); + m8 = _mm256_blend_epi32 (m8, ones, ALPHA_MASK); - out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); - out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; -} + m5 = _mm256_mullo_epi32 (m5, m0); + m6 = _mm256_mullo_epi32 (m6, m1); + m7 = _mm256_mullo_epi32 (m7, m2); + m8 = _mm256_mullo_epi32 (m8, m3); -static void -unpack_row_123a_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - const __m256i channel_shuf = _mm256_set_epi8 ( - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + m5 = _mm256_srli_epi32 (m5, INVERTED_DIV_SHIFT_P16); + m6 = _mm256_srli_epi32 (m6, INVERTED_DIV_SHIFT_P16); + m7 = _mm256_srli_epi32 (m7, INVERTED_DIV_SHIFT_P16); + m8 = _mm256_srli_epi32 (m8, INVERTED_DIV_SHIFT_P16); - SMOL_ASSUME_ALIGNED (row_out, uint64_t * SMOL_RESTRICT); + /* Pack and store */ - unpack_8x_xxxx_u_to_123a_i_128bpp (&row_in, &row_out, row_out_max, - channel_shuf); + m0 = _mm256_packus_epi32 (m5, m6); + m1 = _mm256_packus_epi32 (m7, m8); + m0 = _mm256_packus_epi16 (m0, m1); - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_i_128bpp (*(row_in++), row_out); - row_out += 2; + m0 = _mm256_shuffle_epi8 (m0, channel_shuf); + m0 = _mm256_permute4x64_epi64 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + m0 = _mm256_shuffle_epi32 (m0, SMOL_4X2BIT (3, 1, 2, 0)); + + _mm256_storeu_si256 (my_out, m0); + my_out += 1; } -} -/* --- Filter helpers --- */ + *out = (uint32_t * SMOL_RESTRICT) my_out; + *in = (const uint64_t * SMOL_RESTRICT) my_in; -static SMOL_INLINE const uint32_t * -inrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t inrow_ofs) -{ - return (const uint32_t *) (((const uint8_t *) scale_ctx->pixels_in) + scale_ctx->rowstride_in * inrow_ofs); +#undef ALPHA_MUL +#undef ALPHA_MASK } -static SMOL_INLINE uint32_t * -outrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t outrow_ofs) +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = *src_row >> 32; + *(dest_row++) = *(src_row++); + *(dest_row++) = *(src_row++) >> 32; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row 
!= dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = src_row [1] >> 32; + *(dest_row++) = src_row [0]; + *(dest_row++) = src_row [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_128BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_128BPP (src_row, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1]; \ + unpremul_p8_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + const __m256i channel_shuf = PACK_SHUF_MM256_EPI8_32_TO_128 ((a), (b), (c), (d)); \ + pack_8x_123a_p16_to_xxxx_u_128bpp (&src_row, &dest_row, dest_row_max, \ + channel_shuf); \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (1, 2, 3, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (3, 2, 1, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 1, 2, 3) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 3, 2, 1) + +/* -------------- * + * Filter helpers * + * -------------- */ + +#define LERP_SIMD256_EPI32(a, b, f) \ + 
_mm256_add_epi32 ( \ + _mm256_srli_epi32 ( \ + _mm256_mullo_epi32 ( \ + _mm256_sub_epi32 ((a), (b)), (f)), 8), (b)) + +#define LERP_SIMD128_EPI32(a, b, f) \ + _mm_add_epi32 ( \ + _mm_srli_epi32 ( \ + _mm_mullo_epi32 ( \ + _mm_sub_epi32 ((a), (b)), (f)), 8), (b)) + +#define LERP_SIMD256_EPI32_AND_MASK(a, b, f, mask) \ + _mm256_and_si256 (LERP_SIMD256_EPI32 ((a), (b), (f)), (mask)) + +#define LERP_SIMD128_EPI32_AND_MASK(a, b, f, mask) \ + _mm_and_si128 (LERP_SIMD128_EPI32 ((a), (b), (f)), (mask)) + +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) { - return (uint32_t *) (((uint8_t *) scale_ctx->pixels_out) + scale_ctx->rowstride_out * outrow_ofs); + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; } static SMOL_INLINE uint64_t weight_pixel_64bpp (uint64_t p, uint16_t w) { - return ((p * w) >> 8) & 0x00ff00ff00ff00ff; + return ((p * w) >> 8) & 0x00ff00ff00ff00ffULL; } /* p and out may be the same address */ static SMOL_INLINE void -weight_pixel_128bpp (uint64_t *p, +weight_pixel_128bpp (const uint64_t *p, uint64_t *out, uint16_t w) { @@ -1331,8 +1334,7 @@ scale_128bpp_half (uint64_t accum, b = (accum & 0xffffffff00000000ULL) >> 32; b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - return (a & 0x000000000000ffffULL) - | ((b & 0x000000000000ffffULL) << 32); + return a | (b << 32); } static SMOL_INLINE void @@ -1371,120 +1373,598 @@ add_parts (const uint64_t * SMOL_RESTRICT parts_in, *(parts_acc_out++) += *(parts_in++); } -/* --- Horizontal scaling --- */ - -#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum = 0; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++); \ - F = *(ofs_x++); \ - \ - p = *row_parts_in; \ - q = *(row_parts_in + 1); \ - \ - accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - } \ - *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} \ - \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; \ - const __m128i mask128 = _mm_set_epi32 ( \ - 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ - const __m256i zero256 = _mm256_setzero_si256 (); \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - while (row_parts_out != row_parts_out_max) \ - { \ - __m256i a0 = _mm256_setzero_si256 (); \ - __m128i a1; \ - \ - for (i = 0; i < (1 << ((n_halvings) - 1)); i++) \ - { \ - __m256i m0, m1; \ - __m256i factors; \ - __m128i n0, n1, n2, n3, n4, n5; \ - \ - row_parts_in += *(ofs_x++) * 2; \ - n4 = _mm_set1_epi16 (*(ofs_x++)); \ - n0 = _mm_load_si128 ((__m128i *) row_parts_in); \ 
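/* Illustrative aside: the LERP_SIMD* macros above, the removed bilinear code
 * and the scalar interpolators later in the file all evaluate the same
 * fixed-point blend: with a factor F in 0..256, the result is
 * q + (((p - q) * F) >> 8), applied to every 16-bit lane at once. A minimal
 * standalone model of the 64bpp variant follows; the helper name is an
 * assumption for illustration only. */
#include <stdint.h>
#include <stdio.h>

/* Blend two pixels stored as four 16-bit lanes in a uint64_t. Unsigned
 * wraparound plus the final lane mask yields the correct per-lane result
 * even when a lane of p is smaller than the corresponding lane of q. */
static uint64_t
lerp_parts_sketch (uint64_t p, uint64_t q, uint64_t factor)
{
    return ((((p - q) * factor) >> 8) + q) & 0x00ff00ff00ff00ffULL;
}

int
main (void)
{
    /* Lanes holding 0x00 and 0xff blended halfway: all four lanes become 0x7f. */
    printf ("%016llx\n",
            (unsigned long long) lerp_parts_sketch (0x00ff000000ff0000ULL,
                                                    0x000000ff000000ffULL,
                                                    128));
    return 0;
}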
- n1 = _mm_load_si128 ((__m128i *) row_parts_in + 1); \ - \ - row_parts_in += *(ofs_x++) * 2; \ - n5 = _mm_set1_epi16 (*(ofs_x++)); \ - n2 = _mm_load_si128 ((__m128i *) row_parts_in); \ - n3 = _mm_load_si128 ((__m128i *) row_parts_in + 1); \ - \ - m0 = _mm256_set_m128i (n2, n0); \ - m1 = _mm256_set_m128i (n3, n1); \ - factors = _mm256_set_m128i (n5, n4); \ - factors = _mm256_blend_epi16 (factors, zero256, 0xaa); \ - \ - m0 = LERP_SIMD256_EPI32 (m0, m1, factors); \ - a0 = _mm256_add_epi32 (a0, m0); \ - } \ - \ - a1 = _mm_add_epi32 (_mm256_extracti128_si256 (a0, 0), \ - _mm256_extracti128_si256 (a0, 1)); \ - a1 = _mm_srli_epi32 (a1, (n_halvings)); \ - a1 = _mm_and_si128 (a1, mask128); \ - _mm_store_si128 ((__m128i *) row_parts_out, a1); \ - row_parts_out += 2; \ - } \ +static void +copy_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) = weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +copy_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + weight_pixel_128bpp (parts_in, parts_acc_out, w); + parts_in += 2; + parts_acc_out += 2; + } +} + +static void +add_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) += weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +add_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + uint64_t t [2]; + + weight_pixel_128bpp (parts_in, t, w); + parts_acc_out [0] += t [0]; + parts_acc_out [1] += t [1]; + parts_in += 2; + parts_acc_out += 2; + } +} + +static SMOL_INLINE void +apply_subpixel_opacity_64bpp (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ff00ff00ff00ffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp_half (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp (uint64_t *u64_inout, uint16_t opacity) +{ + apply_subpixel_opacity_128bpp_half (u64_inout, opacity); + apply_subpixel_opacity_128bpp_half (u64_inout + 1, opacity); +} + +static void +apply_subpixel_opacity_row_copy_64bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + n_pixels; + + while (u64_out != u64_out_max) + { + *u64_out = *u64_in++; + apply_subpixel_opacity_64bpp (u64_out, opacity); + 
u64_out++; + } +} + +static void +apply_subpixel_opacity_row_copy_128bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + (n_pixels * 2); + + while (u64_out != u64_out_max) + { + u64_out [0] = u64_in [0]; + u64_out [1] = u64_in [1]; + apply_subpixel_opacity_128bpp_half (u64_out, opacity); + apply_subpixel_opacity_128bpp_half (u64_out + 1, opacity); + u64_in += 2; + u64_out += 2; + } +} + +static void +apply_horiz_edge_opacity (const SmolScaleCtx *scale_ctx, + uint64_t *row_parts) +{ + if (scale_ctx->storage_type == SMOL_STORAGE_64BPP) + { + apply_subpixel_opacity_64bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_64bpp (&row_parts [scale_ctx->hdim.placement_size_px - 1], scale_ctx->hdim.last_opacity); + } + else + { + apply_subpixel_opacity_128bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_128bpp (&row_parts [(scale_ctx->hdim.placement_size_px - 1) * 2], scale_ctx->hdim.last_opacity); + } +} + +/* ------------------ * + * Horizontal scaling * + * ------------------ */ + +static SMOL_INLINE void +hadd_pixels_16x_to_8x_64bpp (__m256i i0, __m256i i1, __m256i i2, __m256i i3, + __m256i * SMOL_RESTRICT o0, __m256i * SMOL_RESTRICT o1) +{ + __m256i t0, t1, t2, t3; + int control_bits; + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + t0 = _mm256_shuffle_epi32 (i0, control_bits); + t1 = _mm256_shuffle_epi32 (i1, control_bits); + t2 = _mm256_shuffle_epi32 (i2, control_bits); + t3 = _mm256_shuffle_epi32 (i3, control_bits); + + t0 = _mm256_add_epi16 (t0, i0); + t1 = _mm256_add_epi16 (t1, i1); + t2 = _mm256_add_epi16 (t2, i2); + t3 = _mm256_add_epi16 (t3, i3); + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + t0 = _mm256_blend_epi32 (t0, t1, control_bits); + t1 = _mm256_blend_epi32 (t2, t3, control_bits); + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + t0 = _mm256_permute4x64_epi64 (t0, control_bits); + t1 = _mm256_permute4x64_epi64 (t1, control_bits); + + *o0 = t0; + *o1 = t1; +} + +static SMOL_INLINE __m256i +hadd_pixels_8x_to_4x_64bpp (__m256i i0, __m256i i1) +{ + __m256i t0, t1; + int control_bits; + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + t0 = _mm256_shuffle_epi32 (i0, control_bits); + t1 = _mm256_shuffle_epi32 (i1, control_bits); + + t0 = _mm256_add_epi16 (t0, i0); + t1 = _mm256_add_epi16 (t1, i1); + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + t0 = _mm256_blend_epi32 (t0, t1, control_bits); + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + t0 = _mm256_permute4x64_epi64 (t0, control_bits); + + return t0; +} + +static SMOL_INLINE void +interp_horizontal_bilinear_batch_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x, + __m256i * SMOL_RESTRICT o0, + __m256i * SMOL_RESTRICT o1, + __m256i * SMOL_RESTRICT o2, + __m256i * SMOL_RESTRICT o3) +{ + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); + const __m256i shuf_0 = _mm256_set_epi8 (3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0, + 3, 2, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0, 1, 0); + const __m256i shuf_1 = _mm256_set_epi8 (7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4, + 7, 6, 7, 6, 7, 6, 7, 6, 5, 4, 5, 4, 5, 4, 5, 4); + const __m256i shuf_2 = _mm256_set_epi8 (11, 10, 11, 10, 11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8, + 11, 10, 11, 10, 11, 10, 11, 10, 9, 8, 9, 8, 9, 8, 9, 8); + const __m256i shuf_3 = _mm256_set_epi8 (15, 
14, 15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12, + 15, 14, 15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 13, 12); + __m256i m0, m1, m2, m3; + __m256i f0, f1, f2, f3; + __m256i q00, q10, q20, q30, q40, q50, q60, q70; + __m256i q01, q11, q21, q31, q41, q51, q61, q71; + __m256i p00, p01, p10, p11, p20, p21, p30, p31; + __m256i f; + int control_bits; + + /* Fetch pixel pairs to interpolate between, two pairs per ymm register. + * This looks clumsy, but it's a lot faster than using _mm256_i32gather_epi64(), + * as benchmarked on both Haswell and Tiger Lake. */ + + q00 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [0]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [1])), 1); + q10 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [2]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [3])), 1); + q20 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [4]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [5])), 1); + q30 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [6]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [7])), 1); + + q40 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [8]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [9])), 1); + q50 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [10]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [11])), 1); + q60 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [12]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [13])), 1); + q70 = _mm256_inserti128_si256 (_mm256_castsi128_si256 ( + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [14]))), + _mm_loadu_si128 ((const __m128i *) (row_parts_in + precalc_x [15])), 1); + + f = _mm256_load_si256 ((const __m256i *) (precalc_x + 16)); /* Factors */ + + /* 0123 -> 0x2x, 1x3x. 4567 -> x4x6, x5x7. Etc. */ + + control_bits = SMOL_4X2BIT (1, 0, 3, 2); + + q01 = _mm256_shuffle_epi32 (q00, control_bits); + q11 = _mm256_shuffle_epi32 (q10, control_bits); + q21 = _mm256_shuffle_epi32 (q20, control_bits); + q31 = _mm256_shuffle_epi32 (q30, control_bits); + q41 = _mm256_shuffle_epi32 (q40, control_bits); + q51 = _mm256_shuffle_epi32 (q50, control_bits); + q61 = _mm256_shuffle_epi32 (q60, control_bits); + q71 = _mm256_shuffle_epi32 (q70, control_bits); + + /* 0x2x, x4x6 -> 0426. 1x3x, x5x7 -> 1537. Etc. */ + + control_bits = SMOL_8X1BIT (1, 1, 0, 0, 1, 1, 0, 0); + + p00 = _mm256_blend_epi32 (q00, q11, control_bits); + p10 = _mm256_blend_epi32 (q20, q31, control_bits); + p20 = _mm256_blend_epi32 (q40, q51, control_bits); + p30 = _mm256_blend_epi32 (q60, q71, control_bits); + + p01 = _mm256_blend_epi32 (q01, q10, control_bits); + p11 = _mm256_blend_epi32 (q21, q30, control_bits); + p21 = _mm256_blend_epi32 (q41, q50, control_bits); + p31 = _mm256_blend_epi32 (q61, q70, control_bits); + + /* Interpolation. 0426 vs 1537. Etc. 
*/ + + m0 = _mm256_sub_epi16 (p00, p01); + m1 = _mm256_sub_epi16 (p10, p11); + m2 = _mm256_sub_epi16 (p20, p21); + m3 = _mm256_sub_epi16 (p30, p31); + + f0 = _mm256_shuffle_epi8 (f, shuf_0); + f1 = _mm256_shuffle_epi8 (f, shuf_1); + f2 = _mm256_shuffle_epi8 (f, shuf_2); + f3 = _mm256_shuffle_epi8 (f, shuf_3); + + m0 = _mm256_mullo_epi16 (m0, f0); + m1 = _mm256_mullo_epi16 (m1, f1); + m2 = _mm256_mullo_epi16 (m2, f2); + m3 = _mm256_mullo_epi16 (m3, f3); + + m0 = _mm256_srli_epi16 (m0, 8); + m1 = _mm256_srli_epi16 (m1, 8); + m2 = _mm256_srli_epi16 (m2, 8); + m3 = _mm256_srli_epi16 (m3, 8); + + m0 = _mm256_add_epi16 (m0, p01); + m1 = _mm256_add_epi16 (m1, p11); + m2 = _mm256_add_epi16 (m2, p21); + m3 = _mm256_add_epi16 (m3, p31); + + m0 = _mm256_and_si256 (m0, mask); + m1 = _mm256_and_si256 (m1, mask); + m2 = _mm256_and_si256 (m2, mask); + m3 = _mm256_and_si256 (m3, mask); + + /* [0426/1537] -> [0246/1357]. Etc. */ + + control_bits = SMOL_4X2BIT (3, 1, 2, 0); + + *o0 = _mm256_permute4x64_epi64 (m0, control_bits); + *o1 = _mm256_permute4x64_epi64 (m1, control_bits); + *o2 = _mm256_permute4x64_epi64 (m2, control_bits); + *o3 = _mm256_permute4x64_epi64 (m3, control_bits); +} + +static __m256i +interp_horizontal_bilinear_batch_to_4x_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x) +{ + __m256i m0, m1, m2, m3, s0, s1; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + hadd_pixels_16x_to_8x_64bpp (m0, m1, m2, m3, &s0, &s1); + return hadd_pixels_8x_to_4x_64bpp (s0, s1); +} + +static __m256i +interp_horizontal_bilinear_4x_batch_to_4x_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + const uint16_t * SMOL_RESTRICT precalc_x) +{ + __m256i t0, t1, t2, t3; + + t0 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 32); + t2 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 64); + t3 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 96); + + hadd_pixels_16x_to_8x_64bpp (t0, t1, t2, t3, &t0, &t1); + return hadd_pixels_8x_to_4x_64bpp (t0, t1); +} + +/* Note that precalc_x must point to offsets and factors interleaved one by one, i.e. + * offset - factor - offset - factor, and not 16x as with the batch function. 
*/ +static SMOL_INLINE void +interp_horizontal_bilinear_epilogue_64bpp (const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out, + uint64_t * SMOL_RESTRICT row_parts_out_max, + const uint16_t * SMOL_RESTRICT precalc_x, + int n_halvings) +{ + while (row_parts_out != row_parts_out_max) + { + uint64_t accum = 0; + int i; + + for (i = 0; i < (1 << (n_halvings)); i++) + { + uint64_t p, q; + uint64_t F; + + p = *(row_parts_in + (*precalc_x)); + q = *(row_parts_in + (*precalc_x) + 1); + precalc_x++; + F = *(precalc_x++); + + accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + + *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; + } +} + +static void +interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 16 <= row_parts_out_max) + { + __m256i m0, m1, m2, m3; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + + _mm256_store_si256 ((__m256i *) row_parts_out + 0, m0); + _mm256_store_si256 ((__m256i *) row_parts_out + 1, m1); + _mm256_store_si256 ((__m256i *) row_parts_out + 2, m2); + _mm256_store_si256 ((__m256i *) row_parts_out + 3, m3); + + row_parts_out += 16; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 0); +} + +static void +interp_horizontal_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 8 <= row_parts_out_max) + { + __m256i m0, m1, m2, m3, s0, s1; + + interp_horizontal_bilinear_batch_64bpp (row_parts_in, precalc_x, &m0, &m1, &m2, &m3); + hadd_pixels_16x_to_8x_64bpp (m0, m1, m2, m3, &s0, &s1); + + s0 = _mm256_srli_epi16 (s0, 1); + s1 = _mm256_srli_epi16 (s1, 1); + + _mm256_store_si256 ((__m256i *) row_parts_out, s0); + _mm256_store_si256 ((__m256i *) row_parts_out + 1, s1); + + row_parts_out += 8; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 1); +} + +static void +interp_horizontal_bilinear_2h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + 
__m256i t = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + t = _mm256_srli_epi16 (t, 2); + _mm256_store_si256 ((__m256i *) row_parts_out, t); + + row_parts_out += 4; + precalc_x += 32; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 2); +} + +static void +interp_horizontal_bilinear_3h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i s0, s1; + + s0 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x); + s1 = interp_horizontal_bilinear_batch_to_4x_64bpp (row_parts_in, precalc_x + 32); + + s0 = hadd_pixels_8x_to_4x_64bpp (s0, s1); + s0 = _mm256_srli_epi16 (s0, 3); + _mm256_store_si256 ((__m256i *) row_parts_out, s0); + + row_parts_out += 4; + precalc_x += 64; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 3); +} + +static void +interp_horizontal_bilinear_4h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i t0; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t0 = _mm256_srli_epi16 (t0, 4); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 128; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 4); +} + +static void +interp_horizontal_bilinear_5h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT row_parts_in, + uint64_t * SMOL_RESTRICT row_parts_out) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; + + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); + + while (row_parts_out + 4 <= row_parts_out_max) + { + __m256i t0, t1; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 128); + + t0 = hadd_pixels_8x_to_4x_64bpp (t0, t1); + t0 = _mm256_srli_epi16 (t0, 5); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 256; + } + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 5); } static void -interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, 
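/* Illustrative aside: the _0h.._6h variants differ only in how many bilinear
 * samples are averaged per destination pixel. With k halvings, 2^k
 * (offset, factor) pairs are consumed, each sample is the fixed-point lerp
 * used above, and the sum is shifted right by k. The AVX2 batches compute 16
 * samples at a time and the hadd_* helpers pair them up; this standalone
 * sketch does the same arithmetic one destination pixel at a time. The
 * interleaved precalc layout mirrors the epilogue above; the names are
 * assumptions for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
supersampled_bilinear_sketch (const uint64_t *src_parts,
                              const uint16_t *precalc,
                              int n_halvings)
{
    uint64_t accum = 0;
    int i;

    for (i = 0; i < (1 << n_halvings); i++)
    {
        uint64_t p = src_parts [precalc [0]];
        uint64_t q = src_parts [precalc [0] + 1];
        uint64_t f = precalc [1];

        /* Each sample lane stays <= 0xff, so up to 64 samples fit in the
         * 16-bit lanes without overflow. */
        accum += ((((p - q) * f) >> 8) + q) & 0x00ff00ff00ff00ffULL;
        precalc += 2;
    }

    return (accum >> n_halvings) & 0x00ff00ff00ff00ffULL;
}

int
main (void)
{
    const uint64_t src_parts [3] = { 0x00ff00ff00ff00ffULL, 0, 0x0040004000400040ULL };
    const uint16_t precalc [4] = { 0, 128, 1, 0 };  /* two samples -> one halving */

    /* (0x7f + 0x40) / 2 = 0x5f in every lane */
    printf ("%016llx\n",
            (unsigned long long) supersampled_bilinear_sketch (src_parts, precalc, 1));
    return 0;
}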
+interp_horizontal_bilinear_6h_64bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out; + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t * SMOL_RESTRICT); + SMOL_ASSUME_ALIGNED (precalc_x, const uint16_t * SMOL_RESTRICT); - do + while (row_parts_out + 4 <= row_parts_out_max) { - row_parts_in += *(ofs_x++); - F = *(ofs_x++); + __m256i t0, t1, t2, t3; + + t0 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x); + t1 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 128); + t2 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 256); + t3 = interp_horizontal_bilinear_4x_batch_to_4x_64bpp (row_parts_in, precalc_x + 384); - p = *row_parts_in; - q = *(row_parts_in + 1); + hadd_pixels_16x_to_8x_64bpp (t0, t1, t2, t3, &t0, &t1); + t0 = hadd_pixels_8x_to_4x_64bpp (t0, t1); - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + t0 = _mm256_srli_epi16 (t0, 6); + _mm256_store_si256 ((__m256i *) row_parts_out, t0); + + row_parts_out += 4; + precalc_x += 512; } - while (row_parts_out != row_parts_out_max); + + interp_horizontal_bilinear_epilogue_64bpp (row_parts_in, row_parts_out, row_parts_out_max, precalc_x, 6); } static void @@ -1492,8 +1972,8 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; const __m256i mask256 = _mm256_set_epi32 ( 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); @@ -1506,19 +1986,20 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, while (row_parts_out + 4 <= row_parts_out_max) { - __m256i m0, m1; __m256i factors; + __m256i m0, m1; __m128i n0, n1, n2, n3, n4, n5; + const uint64_t * SMOL_RESTRICT p0; - row_parts_in += *(ofs_x++) * 2; - n4 = _mm_set1_epi16 (*(ofs_x++)); - n0 = _mm_load_si128 ((__m128i *) row_parts_in); - n1 = _mm_load_si128 ((__m128i *) row_parts_in + 1); + p0 = row_parts_in + *(precalc_x++) * 2; + n4 = _mm_set1_epi16 (*(precalc_x++)); + n0 = _mm_load_si128 ((__m128i *) p0); + n1 = _mm_load_si128 ((__m128i *) p0 + 1); - row_parts_in += *(ofs_x++) * 2; - n5 = _mm_set1_epi16 (*(ofs_x++)); - n2 = _mm_load_si128 ((__m128i *) row_parts_in); - n3 = _mm_load_si128 ((__m128i *) row_parts_in + 1); + p0 = row_parts_in + *(precalc_x++) * 2; + n5 = _mm_set1_epi16 (*(precalc_x++)); + n2 = _mm_load_si128 ((__m128i *) p0); + n3 = _mm_load_si128 ((__m128i *) p0 + 1); m0 = _mm256_set_m128i (n2, n0); m1 = _mm256_set_m128i (n3, n1); @@ -1533,16 +2014,17 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx 
*scale_ctx, /* No need for a loop here; let compiler know we're doing it at most once */ if (row_parts_out != row_parts_out_max) { - __m128i m0, m1; __m128i factors; + __m128i m0, m1; uint32_t f; + const uint64_t * SMOL_RESTRICT p0; - row_parts_in += *(ofs_x++) * 2; - f = *(ofs_x++); + p0 = row_parts_in + *(precalc_x++) * 2; + f = *(precalc_x++); factors = _mm_set1_epi32 ((uint32_t) f); - m0 = _mm_stream_load_si128 ((__m128i *) row_parts_in); - m1 = _mm_stream_load_si128 ((__m128i *) row_parts_in + 1); + m0 = _mm_stream_load_si128 ((__m128i *) p0); + m1 = _mm_stream_load_si128 ((__m128i *) p0 + 1); m0 = LERP_SIMD128_EPI32_AND_MASK (m0, m1, factors, mask128); _mm_store_si128 ((__m128i *) row_parts_out, m0); @@ -1550,142 +2032,170 @@ interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, } } -DEF_INTERP_HORIZONTAL_BILINEAR(1) -DEF_INTERP_HORIZONTAL_BILINEAR(2) -DEF_INTERP_HORIZONTAL_BILINEAR(3) -DEF_INTERP_HORIZONTAL_BILINEAR(4) -DEF_INTERP_HORIZONTAL_BILINEAR(5) -DEF_INTERP_HORIZONTAL_BILINEAR(6) +#define DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(n_halvings) \ +static void \ +interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT row_parts_in, \ + uint64_t * SMOL_RESTRICT row_parts_out) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; \ + const __m256i mask256 = _mm256_set_epi32 ( \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff, \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ + const __m128i mask128 = _mm_set_epi32 ( \ + 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff); \ + const __m256i zero256 = _mm256_setzero_si256 (); \ + int i; \ + \ + SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ +\ + while (row_parts_out != row_parts_out_max) \ + { \ + __m256i a0 = _mm256_setzero_si256 (); \ + __m128i a1; \ +\ + for (i = 0; i < (1 << ((n_halvings) - 1)); i++) \ + { \ + __m256i m0, m1; \ + __m256i factors; \ + __m128i n0, n1, n2, n3, n4, n5; \ + const uint64_t * SMOL_RESTRICT p0; \ +\ + p0 = row_parts_in + *(precalc_x++) * 2; \ + n4 = _mm_set1_epi16 (*(precalc_x++)); \ + n0 = _mm_load_si128 ((__m128i *) p0); \ + n1 = _mm_load_si128 ((__m128i *) p0 + 1); \ +\ + p0 = row_parts_in + *(precalc_x++) * 2; \ + n5 = _mm_set1_epi16 (*(precalc_x++)); \ + n2 = _mm_load_si128 ((__m128i *) p0); \ + n3 = _mm_load_si128 ((__m128i *) p0 + 1); \ +\ + m0 = _mm256_set_m128i (n2, n0); \ + m1 = _mm256_set_m128i (n3, n1); \ + factors = _mm256_set_m128i (n5, n4); \ + factors = _mm256_blend_epi16 (factors, zero256, 0xaa); \ +\ + m0 = LERP_SIMD256_EPI32_AND_MASK (m0, m1, factors, mask256); \ + a0 = _mm256_add_epi32 (a0, m0); \ + } \ +\ + a1 = _mm_add_epi32 (_mm256_extracti128_si256 (a0, 0), \ + _mm256_extracti128_si256 (a0, 1)); \ + a1 = _mm_srli_epi32 (a1, (n_halvings)); \ + a1 = _mm_and_si128 (a1, mask128); \ + _mm_store_si128 ((__m128i *) row_parts_out, a1); \ + row_parts_out += 2; \ + } \ +} + +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(1) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(2) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(3) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(4) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(5) +DEF_INTERP_HORIZONTAL_BILINEAR_128BPP(6) + +static SMOL_INLINE void +unpack_box_precalc (const uint32_t precalc, + uint32_t step, + uint32_t *ofs0, + uint32_t *ofs1, + uint32_t *f0, + uint32_t *f1, + uint32_t *n) +{ + *ofs0 = precalc; + *ofs1 = *ofs0 + step; + *f0 = 256 
- (*ofs0 % SMOL_SUBPIXEL_MUL); + *f1 = *ofs1 % SMOL_SUBPIXEL_MUL; + *ofs0 /= SMOL_SUBPIXEL_MUL; + *ofs1 /= SMOL_SUBPIXEL_MUL; + *n = *ofs1 - *ofs0 - 1; +} static void interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) { const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out - 1; - uint64_t accum = 0; - uint64_t p, q, r, s; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t accum; - pp = row_parts_in; - p = weight_pixel_64bpp (*(pp++), 256); - n = *(ofs_x++); + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); - while (row_parts_out != row_parts_out_max) + while (dest_row_parts < dest_row_parts_max) { - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; - r = *(pp++); - s = r * F; + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); - q = (s >> 8) & 0x00ff00ff00ff00ffULL; + pp = src_row_parts + ofs0; - accum += p + q; - - /* (255 * r) - (F * r) */ - p = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; + accum = weight_pixel_64bpp (*(pp++), f0); + sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); + accum += weight_pixel_64bpp (*pp, f1); - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); - accum = 0; + *(dest_row_parts++) = scale_64bpp (accum, scale_ctx->hdim.span_mul); } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - q = 0; - F = *(ofs_x); - if (F > 0) - q = weight_pixel_64bpp (*(pp), F); - - accum += p + q; - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); } static void interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) { const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + (scale_ctx->width_out - /* 2 */ 1) * 2; - uint64_t accum [2] = { 0, 0 }; - uint64_t p [2], q [2], r [2], s [2]; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + uint64_t accum [2]; - p [0] = *(pp++); - p [1] = *(pp++); - weight_pixel_128bpp (p, p, 256); + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) + while (dest_row_parts < dest_row_parts_max) { - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + uint64_t t [2]; - r [0] = *(pp++); - r [1] = 
*(pp++); + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); - s [0] = r [0] * F; - s [1] = r [1] * F; + pp = src_row_parts + (ofs0 * 2); - q [0] = (s [0] >> 8) & 0x00ffffff00ffffff; - q [1] = (s [1] >> 8) & 0x00ffffff00ffffff; + weight_pixel_128bpp (pp, accum, f0); + pp += 2; - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; + sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - p [0] = (((r [0] << 8) - r [0] - s [0]) >> 8) & 0x00ffffff00ffffff; - p [1] = (((r [1] << 8) - r [1] - s [1]) >> 8) & 0x00ffffff00ffffff; + weight_pixel_128bpp (pp, t, f1); + accum [0] += t [0]; + accum [1] += t [1]; scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); - - accum [0] = 0; - accum [1] = 0; + scale_ctx->hdim.span_mul, + (uint64_t ** SMOL_RESTRICT) &dest_row_parts); } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - q [0] = 0; - q [1] = 0; - - F = *(ofs_x); - if (F > 0) - { - q [0] = *(pp++); - q [1] = *(pp++); - weight_pixel_128bpp (q, q, F); - } - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); } static void @@ -1693,7 +2203,7 @@ interp_horizontal_one_64bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px; uint64_t part; SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); @@ -1709,7 +2219,7 @@ interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, const uint64_t * SMOL_RESTRICT row_parts_in, uint64_t * SMOL_RESTRICT row_parts_out) { - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; + uint64_t *row_parts_out_max = row_parts_out + scale_ctx->hdim.placement_size_px * 2; SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); @@ -1729,7 +2239,7 @@ interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * sizeof (uint64_t)); + memcpy (row_parts_out, row_parts_in, scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); } static void @@ -1740,76 +2250,81 @@ interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * 2 * sizeof (uint64_t)); + memcpy (row_parts_out, row_parts_in, scale_ctx->hdim.placement_size_px * 2 * sizeof (uint64_t)); } static void scale_horizontal (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - const uint32_t *row_in, - uint64_t *row_parts_out) + SmolLocalCtx *local_ctx, + const char *src_row, + uint64_t *dest_row_parts) { - uint64_t * SMOL_RESTRICT unpacked_in; + uint64_t * SMOL_RESTRICT src_row_unpacked; - unpacked_in = vertical_ctx->parts_row [3]; + src_row_unpacked = local_ctx->parts_row [3]; /* 32-bit unpackers need 32-bit alignment */ - if ((((uintptr_t) row_in) & 3) - && scale_ctx->pixel_type_in != SMOL_PIXEL_RGB8 - && scale_ctx->pixel_type_in != SMOL_PIXEL_BGR8) + if ((((uintptr_t) src_row) & 3) 
+ && scale_ctx->src_pixel_type != SMOL_PIXEL_RGB8 + && scale_ctx->src_pixel_type != SMOL_PIXEL_BGR8) { - if (!vertical_ctx->in_aligned) - vertical_ctx->in_aligned = - smol_alloc_aligned (scale_ctx->width_in * sizeof (uint32_t), - &vertical_ctx->in_aligned_storage); - memcpy (vertical_ctx->in_aligned, row_in, scale_ctx->width_in * sizeof (uint32_t)); - row_in = vertical_ctx->in_aligned; + if (!local_ctx->src_aligned) + local_ctx->src_aligned = + smol_alloc_aligned (scale_ctx->hdim.src_size_px * sizeof (uint32_t), + &local_ctx->src_aligned_storage); + memcpy (local_ctx->src_aligned, src_row, scale_ctx->hdim.src_size_px * sizeof (uint32_t)); + src_row = (const char *) local_ctx->src_aligned; } - scale_ctx->unpack_row_func (row_in, - unpacked_in, - scale_ctx->width_in); + scale_ctx->src_unpack_row_func (src_row, + src_row_unpacked, + scale_ctx->hdim.src_size_px); scale_ctx->hfilter_func (scale_ctx, - unpacked_in, - row_parts_out); + src_row_unpacked, + dest_row_parts); + + apply_horiz_edge_opacity (scale_ctx, dest_row_parts); } -/* --- Vertical scaling --- */ +/* ---------------- * + * Vertical scaling * + * ---------------- */ static void -update_vertical_ctx_bilinear (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index) +update_local_ctx_bilinear (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { - uint32_t new_in_ofs = scale_ctx->offsets_y [outrow_index * 2]; + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t new_src_ofs = precalc_y [dest_row_index * 2]; - if (new_in_ofs == vertical_ctx->in_ofs) + if (new_src_ofs == local_ctx->src_ofs) return; - if (new_in_ofs == vertical_ctx->in_ofs + 1) + if (new_src_ofs == local_ctx->src_ofs + 1) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + uint64_t *t = local_ctx->parts_row [0]; + local_ctx->parts_row [0] = local_ctx->parts_row [1]; + local_ctx->parts_row [1] = t; scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); } else { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs), - vertical_ctx->parts_row [0]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs), + local_ctx->parts_row [0]); scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); } - vertical_ctx->in_ofs = new_in_ofs; + local_ctx->src_ofs = new_src_ofs; } static void @@ -1819,13 +2334,37 @@ interp_vertical_bilinear_store_64bpp (uint64_t F, uint64_t * SMOL_RESTRICT parts_out, uint32_t width) { + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); uint64_t *parts_out_last = parts_out + width; + __m256i F256; SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - do + F256 = _mm256_set1_epi16 ((uint16_t) F); + + while (parts_out + 4 <= parts_out_last) + { + __m256i m0, m1; + + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); + top_row_parts_in += 4; + m1 = _mm256_load_si256 ((const __m256i 
*) bottom_row_parts_in); + bottom_row_parts_in += 4; + + m0 = _mm256_sub_epi16 (m0, m1); + m0 = _mm256_mullo_epi16 (m0, F256); + m0 = _mm256_srli_epi16 (m0, 8); + m0 = _mm256_add_epi16 (m0, m1); + m0 = _mm256_and_si256 (m0, mask); + + _mm256_store_si256 ((__m256i *) parts_out, m0); + parts_out += 4; + } + + while (parts_out != parts_out_last) { uint64_t p, q; @@ -1834,23 +2373,76 @@ interp_vertical_bilinear_store_64bpp (uint64_t F, *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; } - while (parts_out != parts_out_last); } static void -interp_vertical_bilinear_add_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, +interp_vertical_bilinear_store_with_opacity_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_64bpp (uint16_t F, + const uint64_t *top_row_parts_in, + const uint64_t *bottom_row_parts_in, + uint64_t *accum_out, uint32_t width) { + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); uint64_t *accum_out_last = accum_out + width; + __m256i F256; SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - do + F256 = _mm256_set1_epi16 ((uint16_t) F); + + while (accum_out + 4 <= accum_out_last) + { + __m256i m0, m1, o0; + + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); + top_row_parts_in += 4; + m1 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); + bottom_row_parts_in += 4; + o0 = _mm256_load_si256 ((const __m256i *) accum_out); + + m0 = _mm256_sub_epi16 (m0, m1); + m0 = _mm256_mullo_epi16 (m0, F256); + m0 = _mm256_srli_epi16 (m0, 8); + m0 = _mm256_add_epi16 (m0, m1); + m0 = _mm256_and_si256 (m0, mask); + + o0 = _mm256_add_epi16 (o0, m0); + _mm256_store_si256 ((__m256i *) accum_out, o0); + accum_out += 4; + } + + while (accum_out != accum_out_last) { uint64_t p, q; @@ -1859,7 +2451,6 @@ interp_vertical_bilinear_add_64bpp (uint64_t F, *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; } - while (accum_out != accum_out_last); } static void @@ -1922,6 +2513,34 @@ interp_vertical_bilinear_store_128bpp (uint64_t F, } } +static void +interp_vertical_bilinear_store_with_opacity_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + 
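+    /* Blend the top and bottom source rows with weight F (0..256), then
+     * attenuate each blended value by the destination row's subpixel
+     * opacity before storing. Callers use this variant for partially
+     * covered first/last output rows. */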
do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + apply_subpixel_opacity_128bpp_half (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + static void interp_vertical_bilinear_add_128bpp (uint64_t F, const uint64_t * SMOL_RESTRICT top_row_parts_in, @@ -1954,7 +2573,7 @@ interp_vertical_bilinear_add_128bpp (uint64_t F, m3 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); bottom_row_parts_in += 4; o0 = _mm256_load_si256 ((const __m256i *) accum_out); - o1 = _mm256_load_si256 ((const __m256i *) accum_out + 4); + o1 = _mm256_load_si256 ((const __m256i *) (accum_out + 4)); m0 = _mm256_sub_epi32 (m0, m1); m2 = _mm256_sub_epi32 (m2, m3); @@ -1986,230 +2605,418 @@ interp_vertical_bilinear_add_128bpp (uint64_t F, } } -#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ +#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ const uint64_t * SMOL_RESTRICT top_row_parts_in, \ const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + uint32_t width) \ +{ \ + const __m256i mask = _mm256_set_epi16 (0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, \ + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff); \ + uint64_t *accum_inout_last = accum_inout + width; \ + __m256i F256; \ +\ + SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + F256 = _mm256_set1_epi16 ((uint16_t) F); \ +\ + while (accum_inout + 4 <= accum_inout_last) \ + { \ + __m256i m0, m1, o0; \ +\ + m0 = _mm256_load_si256 ((const __m256i *) top_row_parts_in); \ + top_row_parts_in += 4; \ + m1 = _mm256_load_si256 ((const __m256i *) bottom_row_parts_in); \ + bottom_row_parts_in += 4; \ + o0 = _mm256_load_si256 ((const __m256i *) accum_inout); \ +\ + m0 = _mm256_sub_epi16 (m0, m1); \ + m0 = _mm256_mullo_epi16 (m0, F256); \ + m0 = _mm256_srli_epi16 (m0, 8); \ + m0 = _mm256_add_epi16 (m0, m1); \ + m0 = _mm256_and_si256 (m0, mask); \ +\ + o0 = _mm256_add_epi16 (o0, m0); \ + o0 = _mm256_srli_epi16 (o0, n_halvings); \ +\ + _mm256_store_si256 ((__m256i *) accum_inout, o0); \ + accum_inout += 4; \ + } \ +\ + while (accum_inout != accum_inout_last) \ + { \ + uint64_t p, q; \ +\ + p = *(top_row_parts_in++); \ + q = *(bottom_row_parts_in++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + *(accum_inout++) = p; \ + } \ +} \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t 
*accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} \ - \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ +\ + apply_subpixel_opacity_64bpp (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} - -#define DEF_SCALE_OUTROW_BILINEAR(n_halvings) \ -static void \ -scale_outrow_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} \ - \ -static void \ 
-scale_outrow_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} - -static void -scale_outrow_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + apply_subpixel_opacity_128bpp_half (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} + +#define DEF_SCALE_DEST_ROW_BILINEAR(n_halvings) \ +static int \ +scale_dest_row_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + 
{ \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ +\ + return 2; \ +} \ +\ +static int \ +scale_dest_row_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ +\ + return 2; \ +} + +static int +scale_dest_row_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + 
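+    /* No halvings: each destination row is produced from a single bilinear
+     * sample pair. Rows at the placement edges with partial subpixel
+     * coverage (first_opacity or last_opacity < 256) take the
+     * opacity-aware store path below. */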
update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); -static void -scale_outrow_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + return 2; } DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) -static void -scale_outrow_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); +static int +scale_dest_row_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + 
local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; } -static void -scale_outrow_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); +static int +scale_dest_row_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + 
interp_vertical_bilinear_final_1h_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; } DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) -DEF_SCALE_OUTROW_BILINEAR(2) +DEF_SCALE_DEST_ROW_BILINEAR(2) DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) -DEF_SCALE_OUTROW_BILINEAR(3) +DEF_SCALE_DEST_ROW_BILINEAR(3) DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) -DEF_SCALE_OUTROW_BILINEAR(4) +DEF_SCALE_DEST_ROW_BILINEAR(4) DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) -DEF_SCALE_OUTROW_BILINEAR(5) +DEF_SCALE_DEST_ROW_BILINEAR(5) DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) -DEF_SCALE_OUTROW_BILINEAR(6) +DEF_SCALE_DEST_ROW_BILINEAR(6) static void finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, @@ -2229,581 +3036,418 @@ finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, } static void -weight_edge_row_64bpp (uint64_t *row, - uint16_t w, - uint32_t n) +finalize_vertical_with_opacity_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) { - uint64_t *row_max = row + n; + uint64_t *parts_dest_max = dest_parts + n; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (row != row_max) + while (dest_parts != parts_dest_max) { - *row = ((*row * w) >> 8) & 0x00ff00ff00ff00ffULL; - row++; + *dest_parts = scale_64bpp (*(accums++), multiplier); + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; } } -static void -scale_and_weight_edge_rows_box_64bpp (const uint64_t * SMOL_RESTRICT first_row, - uint64_t * SMOL_RESTRICT last_row, - uint64_t * SMOL_RESTRICT accum, - uint16_t w2, - uint32_t n) +static int +scale_dest_row_box_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { - const uint64_t *first_row_max = first_row + n; - - SMOL_ASSUME_ALIGNED (first_row, const uint64_t *); - SMOL_ASSUME_ALIGNED (last_row, uint64_t *); - SMOL_ASSUME_ALIGNED (accum, uint64_t *); + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; - while (first_row != first_row_max) - { - uint64_t r, s, p, q; + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); - p = *(first_row++); + /* First input row */ - r = *(last_row); - s = r * w2; - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - /* (255 * r) - (F * r) */ - *(last_row++) = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; - *(accum++) = p + q; - } -} + /* Add up whole input rows */ -static void -update_vertical_ctx_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t ofs_y, - uint32_t ofs_y_max, - uint16_t w1, - uint16_t w2) -{ - /* Old in_ofs is the previous max */ - if (ofs_y == vertical_ctx->in_ofs) + for (i = 0; i < n; i++) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + 
local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px); + + ofs_y++; } - else + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_edge_row_64bpp (vertical_ctx->parts_row [0], w1, scale_ctx->width_out); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); } - /* When w2 == 0, the final inrow may be out of bounds. Don't try to access it in - * that case. */ - if (w2 || ofs_y_max < scale_ctx->height_in) + /* Finalize */ + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y_max), - vertical_ctx->parts_row [1]); + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); } - else + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) { - memset (vertical_ctx->parts_row [1], 0, scale_ctx->width_out * sizeof (uint64_t)); + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); } - - vertical_ctx->in_ofs = ofs_y_max; -} - -static void -scale_outrow_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t ofs_y, ofs_y_max; - uint16_t w1, w2; - - /* Get the inrow range for this outrow: [ofs_y .. ofs_y_max> */ - - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first and last rows, weight them and store in accumulator */ - - w1 = (outrow_index == 0) ? 
256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1]; - w2 = scale_ctx->offsets_y [outrow_index * 2 + 1]; - - update_vertical_ctx_box_64bpp (scale_ctx, vertical_ctx, ofs_y, ofs_y_max, w1, w2); - - scale_and_weight_edge_rows_box_64bpp (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - w2, - scale_ctx->width_out); - - ofs_y++; - - /* Add up whole rows */ - - while (ofs_y < ofs_y_max) + else { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - add_parts (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - - ofs_y++; + finalize_vertical_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); } - finalize_vertical_64bpp (vertical_ctx->parts_row [2], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [0], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 0; } static void finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, + uint64_t * SMOL_RESTRICT dest_parts, uint32_t n) { - uint64_t *parts_out_max = parts_out + n * 2; + uint64_t *parts_dest_max = dest_parts + n * 2; SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (parts_out != parts_out_max) + while (dest_parts != parts_dest_max) { - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); } } static void -weight_row_128bpp (uint64_t *row, - uint16_t w, - uint32_t n) +finalize_vertical_with_opacity_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) { - uint64_t *row_max = row + (n * 2); + uint64_t *parts_dest_max = dest_parts + n * 2; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); - while (row != row_max) + while (dest_parts != parts_dest_max) { - row [0] = ((row [0] * w) >> 8) & 0x00ffffff00ffffffULL; - row [1] = ((row [1] * w) >> 8) & 0x00ffffff00ffffffULL; - row += 2; + dest_parts [0] = scale_128bpp_half (*(accums++), multiplier); + dest_parts [1] = scale_128bpp_half (*(accums++), multiplier); + apply_subpixel_opacity_128bpp (dest_parts, opacity); + dest_parts += 2; } } -static void -scale_outrow_box_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +static int +scale_dest_row_box_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) { + uint32_t *precalc_y = scale_ctx->vdim.precalc; uint32_t ofs_y, ofs_y_max; - uint16_t w; + uint32_t w1, w2; + uint32_t n, i; - /* Get the inrow range for this outrow: [ofs_y .. 
ofs_y_max> */ + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first inrow and store it */ + /* First input row */ scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_row_128bpp (vertical_ctx->parts_row [0], - outrow_index == 0 ? 256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1], - scale_ctx->width_out); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); ofs_y++; - /* Add up whole rows */ + /* Add up whole input rows */ - while (ofs_y < ofs_y_max) + for (i = 0; i < n; i++) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px * 2); ofs_y++; } - /* Final row is optional; if this is the bottommost outrow it could be out of bounds */ + /* Last input row */ - w = scale_ctx->offsets_y [outrow_index * 2 + 1]; - if (w > 0) + if (ofs_y < scale_ctx->vdim.src_size_px) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - weight_row_128bpp (vertical_ctx->parts_row [1], - w - 1, /* Subtract 1 to avoid overflow */ - scale_ctx->width_out); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); } - finalize_vertical_128bpp (vertical_ctx->parts_row [0], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [1], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [1], row_out, scale_ctx->width_out); + return 0; } -static void -scale_outrow_one_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_one_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { - SMOL_UNUSED (row_index); - /* Scale the row and store it */ - if (vertical_ctx->in_ofs != 0) + if (local_ctx->src_ofs != 0) { scale_horizontal (scale_ctx, - 
vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 1; } -static void -scale_outrow_one_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_one_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { - SMOL_UNUSED (row_index); - /* Scale the row and store it */ - if (vertical_ctx->in_ofs != 0) + if (local_ctx->src_ofs != 0) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t) * 2); } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return 1; } -static void -scale_outrow_copy (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +static int +scale_dest_row_copy (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) { scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, row_index), - vertical_ctx->parts_row [0]); - - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); -} + local_ctx, + src_row_ofs_to_pointer (scale_ctx, row_index), + local_ctx->parts_row [0]); + + return 0; +} + +/* --------------- * + * Function tables * + * --------------- */ + +#define R SMOL_REPACK_META + +static const SmolRepackMeta repack_meta [] = +{ + R (123, 24, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, 
UNASSOCIATED, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + + SMOL_REPACK_META_LAST +}; -/* --- Conversion tables --- */ +#undef R -static const SmolConversionTable avx2_conversions = +static const SmolImplementation implementation = { -{ { - /* Conversions where accumulators must hold the sum of fewer than - * 256 pixels. This can be done in 64bpp, but 128bpp may be used - * e.g. for 16 bits per channel internally premultiplied data. 
*/ - - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 
123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, 
p, 132a, p, 123, p, 64), - } - }, + /* Horizontal init */ + init_horizontal, - { - /* Conversions where accumulators must hold the sum of up to - * 65535 pixels. We need 128bpp for this. */ - - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ 
SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 
1234, p, 1234, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - } -} } -}; + /* Vertical init */ + init_vertical, -static const SmolImplementation avx2_implementation = -{ { /* Horizontal filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, { /* 64bpp */ interp_horizontal_copy_64bpp, @@ -2833,38 +3477,65 @@ static const SmolImplementation avx2_implementation = }, { /* Vertical filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, { /* 64bpp */ - scale_outrow_copy, - scale_outrow_one_64bpp, - scale_outrow_bilinear_0h_64bpp, - scale_outrow_bilinear_1h_64bpp, - scale_outrow_bilinear_2h_64bpp, - scale_outrow_bilinear_3h_64bpp, - scale_outrow_bilinear_4h_64bpp, - scale_outrow_bilinear_5h_64bpp, - scale_outrow_bilinear_6h_64bpp, - scale_outrow_box_64bpp + scale_dest_row_copy, + scale_dest_row_one_64bpp, + scale_dest_row_bilinear_0h_64bpp, + scale_dest_row_bilinear_1h_64bpp, + scale_dest_row_bilinear_2h_64bpp, + scale_dest_row_bilinear_3h_64bpp, + scale_dest_row_bilinear_4h_64bpp, + scale_dest_row_bilinear_5h_64bpp, + scale_dest_row_bilinear_6h_64bpp, + scale_dest_row_box_64bpp }, { /* 128bpp */ - scale_outrow_copy, - scale_outrow_one_128bpp, - scale_outrow_bilinear_0h_128bpp, - scale_outrow_bilinear_1h_128bpp, - scale_outrow_bilinear_2h_128bpp, - scale_outrow_bilinear_3h_128bpp, - scale_outrow_bilinear_4h_128bpp, - scale_outrow_bilinear_5h_128bpp, - scale_outrow_bilinear_6h_128bpp, - scale_outrow_box_128bpp + scale_dest_row_copy, + scale_dest_row_one_128bpp, + scale_dest_row_bilinear_0h_128bpp, + scale_dest_row_bilinear_1h_128bpp, + scale_dest_row_bilinear_2h_128bpp, + scale_dest_row_bilinear_3h_128bpp, + scale_dest_row_bilinear_4h_128bpp, + scale_dest_row_bilinear_5h_128bpp, + scale_dest_row_bilinear_6h_128bpp, + scale_dest_row_box_128bpp } }, - &avx2_conversions + { + /* Composite over color */ + NULL, + NULL, + NULL, + NULL + }, + { + /* Composite over dest */ + NULL, + NULL, + NULL, + NULL + }, + { + /* Clear dest */ + NULL, + NULL, + NULL, + NULL + }, + repack_meta }; const SmolImplementation * _smol_get_avx2_implementation (void) { - return &avx2_implementation; + return &implementation; } diff --git a/chafa/internal/smolscale/smolscale-generic.c b/chafa/internal/smolscale/smolscale-generic.c new file mode 100644 index 00000000..1a27db8c --- /dev/null +++ b/chafa/internal/smolscale/smolscale-generic.c @@ -0,0 +1,3080 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Copyright © 2019-2023 Hans Petter Jansson. See COPYING for details. */ + +#include +#include /* malloc, free, alloca */ +#include /* memset */ +#include +#include "smolscale-private.h" + +/* ---------------------- * + * Context initialization * + * ---------------------- */ + +/* Linear precalc array: + * + * Each sample is extracted from a pair of adjacent pixels. The sample precalc + * consists of the first pixel's index, followed by its sample fraction [0..256]. + * The second sample is implicitly taken at index+1 and weighted as 256-fraction. 
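+ * E.g. an entry of { 3, 192 } means 192/256 of pixel 3 plus 64/256 of pixel 4.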
+ * _ _ _ + * In |_| |_| |_| + * \_/ \_/ <- two samples per output pixel + * Out |_| |_| + * + * When halving, + * _ _ _ + * In |_| |_| |_| + * \_/ \_/ <- four samples per output pixel + * |_| |_| + * \_/ <- halving + * Out |_| + */ + +static void +precalc_linear_range (uint16_t *array_out, + int first_index, int last_index, + uint64_t first_sample_ofs, uint64_t sample_step, + int sample_ofs_px_max, + int32_t dest_clip_before_px, + int *array_i_inout) +{ + uint64_t sample_ofs; + int i; + + sample_ofs = first_sample_ofs; + + for (i = first_index; i < last_index; i++) + { + uint16_t sample_ofs_px = sample_ofs / SMOL_BILIN_MULTIPLIER; + + if (sample_ofs_px >= sample_ofs_px_max - 1) + { + if (i >= dest_clip_before_px) + { + array_out [(*array_i_inout) * 2] = sample_ofs_px_max - 2; + array_out [(*array_i_inout) * 2 + 1] = 0; + (*array_i_inout)++; + } + continue; + } + + if (i >= dest_clip_before_px) + { + array_out [(*array_i_inout) * 2] = sample_ofs_px; + array_out [(*array_i_inout) * 2 + 1] = SMOL_SMALL_MUL + - ((sample_ofs / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); + (*array_i_inout)++; + } + + sample_ofs += sample_step; + } +} + +static void +precalc_bilinear_array (uint16_t *array, + uint64_t src_dim_spx, + uint64_t dest_ofs_spx, + uint64_t dest_dim_spx, + uint32_t dest_dim_prehalving_px, + unsigned int n_halvings, + int32_t dest_clip_before_px) +{ + uint32_t src_dim_px = SMOL_SPX_TO_PX (src_dim_spx); + uint64_t first_sample_ofs [3]; + uint64_t sample_step; + int i = 0; + + assert (src_dim_px > 1); + + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; + + if (src_dim_spx > dest_dim_spx) + { + /* Minification */ + sample_step = ((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / dest_dim_spx; + first_sample_ofs [0] = (sample_step - SMOL_BILIN_MULTIPLIER) / 2; + first_sample_ofs [1] = ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + + ((sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx) * (1 << n_halvings)) / SMOL_SUBPIXEL_MUL); + } + else + { + /* Magnification */ + sample_step = ((src_dim_spx - SMOL_SUBPIXEL_MUL) * SMOL_BILIN_MULTIPLIER) + / (dest_dim_spx > SMOL_SUBPIXEL_MUL ? (dest_dim_spx - SMOL_SUBPIXEL_MUL) : 1); + first_sample_ofs [0] = 0; + first_sample_ofs [1] = (sample_step * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL; + } + + first_sample_ofs [2] = (((uint64_t) src_dim_spx * SMOL_BILIN_MULTIPLIER) / SMOL_SUBPIXEL_MUL) + + ((sample_step - SMOL_BILIN_MULTIPLIER) / 2) + - sample_step * (1U << n_halvings); + + /* Left fringe */ + precalc_linear_range (array, + 0, + 1 << n_halvings, + first_sample_ofs [0], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); + + /* Main range */ + precalc_linear_range (array, + 1 << n_halvings, + dest_dim_prehalving_px - (1 << n_halvings), + first_sample_ofs [1], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); + + /* Right fringe */ + precalc_linear_range (array, + dest_dim_prehalving_px - (1 << n_halvings), + dest_dim_prehalving_px, + first_sample_ofs [2], + sample_step, + src_dim_px, + dest_clip_before_px, + &i); +} + +static void +precalc_boxes_array (uint32_t *array, + uint32_t *span_step, + uint32_t *span_mul, + uint32_t src_dim_spx, + int32_t dest_dim, + uint32_t dest_ofs_spx, + uint32_t dest_dim_spx, + int32_t dest_clip_before_px) +{ + uint64_t fracF, frac_stepF; + uint64_t f; + uint64_t stride; + uint64_t a, b; + int i, dest_i; + + dest_ofs_spx %= SMOL_SUBPIXEL_MUL; + + /* Output sample can't be less than a pixel. Fringe opacity is applied in + * a separate step. 
FIXME: May cause wrong subpixel distribution -- revisit. */ + if (dest_dim_spx < 256) + dest_dim_spx = 256; + + frac_stepF = ((uint64_t) src_dim_spx * SMOL_BIG_MUL) / (uint64_t) dest_dim_spx; + fracF = 0; + + stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; + f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; + + /* We divide by (b + 1) instead of just (b) to avoid overflows in + * scale_128bpp_half(), which would affect horizontal box scaling. The + * fudge factor counters limited precision in the inverted division + * operation. It causes 16-bit values to undershoot by less than 127/65535 + * (<.2%). Since the final output is 8-bit, and rounding neutralizes the + * error, this doesn't matter. */ + + a = (SMOL_BOXES_MULTIPLIER * 255); + b = ((stride * 255) + ((f * 255) / 256)); + *span_step = frac_stepF / SMOL_SMALL_MUL; + *span_mul = (a + (b / 2)) / (b + 1); + + /* Left fringe */ + i = 0; + dest_i = 0; + + if (dest_i >= dest_clip_before_px) + array [i++] = 0; + + /* Main range */ + fracF = ((frac_stepF * (SMOL_SUBPIXEL_MUL - dest_ofs_spx)) / SMOL_SUBPIXEL_MUL); + for (dest_i = 1; dest_i < dest_dim - 1; dest_i++) + { + if (dest_i >= dest_clip_before_px) + array [i++] = fracF / SMOL_SMALL_MUL; + fracF += frac_stepF; + } + + /* Right fringe */ + if (dest_dim > 1 && dest_i >= dest_clip_before_px) + array [i++] = (((uint64_t) src_dim_spx * SMOL_SMALL_MUL - frac_stepF) / SMOL_SMALL_MUL); +} + +static void +init_dim (SmolDim *dim) +{ + if (dim->filter_type == SMOL_FILTER_ONE || dim->filter_type == SMOL_FILTER_COPY) + { + } + else if (dim->filter_type == SMOL_FILTER_BOX) + { + precalc_boxes_array (dim->precalc, + &dim->span_step, + &dim->span_mul, + dim->src_size_spx, + dim->placement_size_px, + dim->placement_ofs_spx, + dim->placement_size_spx, + dim->clip_before_px); + } + else /* SMOL_FILTER_BILINEAR_?H */ + { + precalc_bilinear_array (dim->precalc, + dim->src_size_spx, + dim->placement_ofs_spx, + dim->placement_size_prehalving_spx, + dim->placement_size_prehalving_px, + dim->n_halvings, + dim->clip_before_px); + } +} + +static void +init_horizontal (SmolScaleCtx *scale_ctx) +{ + init_dim (&scale_ctx->hdim); +} + +static void +init_vertical (SmolScaleCtx *scale_ctx) +{ + init_dim (&scale_ctx->vdim); +} + +/* ---------------------- * + * sRGB/linear conversion * + * ---------------------- */ + +static void +from_srgb_pixel_xxxa_128bpp (uint64_t * SMOL_RESTRICT pixel_inout) +{ + uint64_t part; + + part = pixel_inout [0]; + pixel_inout [0] = + ((uint64_t) _smol_from_srgb_lut [part >> 32] << 32) + | _smol_from_srgb_lut [part & 0xff]; + + part = pixel_inout [1]; + pixel_inout [1] = + ((uint64_t) _smol_from_srgb_lut [part >> 32] << 32) + | ((part & 0xffffffff) << 3) | 7; +} + +static void +to_srgb_pixel_xxxa_128bpp (const uint64_t *pixel_in, uint64_t *pixel_out) +{ + pixel_out [0] = + (((uint64_t) _smol_to_srgb_lut [pixel_in [0] >> 32]) << 32) + | _smol_to_srgb_lut [pixel_in [0] & 0xffff]; + + pixel_out [1] = + (((uint64_t) _smol_to_srgb_lut [pixel_in [1] >> 32]) << 32) + | (pixel_in [1] & 0xffffffff); /* FIXME: No need to preserve alpha? */ +} + +/* Fetches alpha from linear pixel. Input alpha is in the range [0x000..0x7ff]. + * Returned alpha is in the range [0x00..0xff], rounded towards 0xff. 
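 * E.g. a linear alpha of 0x7ff yields (0x7ff + 7) >> 3 = 0x100, which the
 * final subtraction clamps to 0xff; 0x000 maps to 0x00.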
*/ +static SMOL_INLINE uint8_t +get_alpha_from_linear_xxxa_128bpp (const uint64_t * SMOL_RESTRICT pixel_in) +{ + uint16_t alpha = (pixel_in [1] + 7) >> 3; + return (uint8_t) (alpha - (alpha >> 8)); /* Turn 0x100 into 0xff */ +} + +/* ----------------- * + * Premultiplication * + * ----------------- */ + +static SMOL_INLINE void +premul_u_to_p8_128bpp (uint64_t * SMOL_RESTRICT inout, + uint16_t alpha) +{ + inout [0] = ((inout [0] * (alpha + 1)) >> 8) & 0x000000ff000000ff; + inout [1] = ((inout [1] * (alpha + 1)) >> 8) & 0x000000ff000000ff; +} + +static SMOL_INLINE void +unpremul_p8_to_u_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; + out [1] = ((in [1] * _smol_inv_div_p8_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8) & 0x000000ff000000ff; +} + +static SMOL_INLINE uint64_t +premul_u_to_p8_64bpp (const uint64_t in, + uint16_t alpha) +{ + return ((in * (alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; +} + +static SMOL_INLINE uint64_t +unpremul_p8_to_u_64bpp (const uint64_t in, + uint8_t alpha) +{ + uint64_t in_128bpp [2]; + uint64_t dest_128bpp [2]; + + in_128bpp [0] = (in & 0x000000ff000000ff); + in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; + + unpremul_p8_to_u_128bpp (in_128bpp, dest_128bpp, alpha); + + return dest_128bpp [0] | (dest_128bpp [1] << 16); +} + +static SMOL_INLINE void +premul_ul_to_p8l_128bpp (uint64_t * SMOL_RESTRICT inout, + uint16_t alpha) +{ + inout [0] = ((inout [0] * (alpha + 1)) >> 8) & 0x000007ff000007ff; + inout [1] = (((inout [1] * (alpha + 1)) >> 8) & 0x000007ff00000000) + | (inout [1] & 0x000007ff); +} + +static SMOL_INLINE void +unpremul_p8l_to_ul_128bpp (const uint64_t *in, + uint64_t *out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p8l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8L) & 0x000007ff000007ff; + out [1] = ((in [1] * _smol_inv_div_p8l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P8L) & 0x000007ff000007ff; +} + +static SMOL_INLINE void +premul_u_to_p16_128bpp (uint64_t *inout, + uint8_t alpha) +{ + inout [0] = inout [0] * ((uint16_t) alpha + 2); + inout [1] = inout [1] * ((uint16_t) alpha + 2); +} + +static SMOL_INLINE void +unpremul_p16_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; + out [1] = ((in [1] * _smol_inv_div_p16_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16) & 0x000000ff000000ffULL; +} + +static SMOL_INLINE void +premul_ul_to_p16l_128bpp (uint64_t *inout, + uint8_t alpha) +{ + inout [0] = inout [0] * ((uint16_t) alpha + 2); + inout [1] = inout [1] * ((uint16_t) alpha + 2); +} + +static SMOL_INLINE void +unpremul_p16l_to_ul_128bpp (const uint64_t * SMOL_RESTRICT in, + uint64_t * SMOL_RESTRICT out, + uint8_t alpha) +{ + out [0] = ((in [0] * _smol_inv_div_p16l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16L) & 0x000007ff000007ffULL; + out [1] = ((in [1] * _smol_inv_div_p16l_lut [alpha]) + >> INVERTED_DIV_SHIFT_P16L) & 0x000007ff000007ffULL; +} + +/* --------- * + * Repacking * + * --------- */ + +/* It's nice to be able to shift by a negative amount */ +#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) + +/* This is kind of bulky (~13 x86 insns), but it's about the same as using + * unions, and we don't have to worry about endianness. 
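 * PACK_FROM_1234_64BPP(in, a, b, c, d) picks channels a, b, c and d out of a
 * 64bpp pixel and packs them into a 32-bit value, most significant byte first;
 * the 128bpp variant does the same for a two-word pixel.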
*/ +#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ + ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ + | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) + +#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 2 : n) + +#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ + ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ + | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) + +/* ---------------------- * + * Repacking: 24/32 -> 64 * + * ---------------------- */ + +static SMOL_INLINE uint64_t +unpack_pixel_123_p8_to_132a_p8_64bpp (const uint8_t *p) +{ + return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) + | ((uint64_t) p [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_123_p8_to_132a_p8_64bpp (src_row); + src_row += 3; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_1324_p8_64bpp (uint32_t p) +{ + return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_3241_p8_64bpp (uint32_t p) +{ + return (((uint64_t) p & 0x0000ff00) << 40) + | (((uint64_t) p & 0x00ff00ff) << 16) | (p >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_3241_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_p8_to_2431_p8_64bpp (uint32_t p) +{ + uint64_t p64 = p; + + return ((p64 & 0x00ff00ff) << 32) | ((p64 & 0x0000ff00) << 8) + | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_p8_to_2431_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_a234_u_to_324a_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 40) | (((uint64_t) p & 0x00ff00ff) << 16); + uint8_t alpha = p >> 24; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 3241, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_a234_u_to_324a_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_1234_u_to_2431_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) 
| (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2431, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_1234_u_to_2431_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE uint64_t +unpack_pixel_123a_u_to_132a_p8_64bpp (uint32_t p) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p & 0xff; + + return (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1324, 64, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = unpack_pixel_123a_u_to_132a_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +/* ----------------------- * + * Repacking: 24/32 -> 128 * + * ----------------------- */ + +static SMOL_INLINE void +unpack_pixel_123_p8_to_123a_p8_128bpp (const uint8_t *in, + uint64_t *out) +{ + out [0] = ((uint64_t) in [0] << 32) | in [1]; + out [1] = ((uint64_t) in [2] << 32) | 0xff; +} + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (123, 24, 8, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_123_p8_to_123a_p8_128bpp (src_row, dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + src_row += 3; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_p8_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_123a_p8_to_123a_p8_128bpp (*(src_row++), dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_p8_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32) | ((p64 & 0xff000000) >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 32, 32, PREMUL8, 
COMPRESSED, + 2341, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + uint8_t alpha; + unpack_pixel_a234_p8_to_234a_p8_128bpp (*(src_row++), dest_row); + alpha = dest_row [1]; + unpremul_p8_to_u_128bpp (dest_row, dest_row, alpha); + from_srgb_pixel_xxxa_128bpp (dest_row); + premul_ul_to_p8l_128bpp (dest_row, alpha); + dest_row [1] = (dest_row [1] & 0xffffffff00000000) | (alpha << 3) | 7; + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0x00ff00ff) << 32) | (((uint64_t) p & 0x0000ff00) << 8); + uint8_t alpha = p >> 24; + + p64 = (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_pl_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p8l_128bpp (out, alpha); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_pl_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + premul_u_to_p16_128bpp (out, alpha); + out [1] |= (((uint16_t) alpha) << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_a234_u_to_234a_p16l_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p >> 24; + + out [0] = ((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8); + out [1] = ((p64 & 0x000000ff) << 32); + + from_srgb_pixel_xxxa_128bpp (out); + out [0] *= alpha; + out [1] *= alpha; + + out [1] = (out [1] & 0xffffffff00000000ULL) | (alpha << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 2341, 128, 64, PREMUL16, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_a234_u_to_234a_p16l_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p8_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); + uint8_t alpha = p; + + p64 = (premul_u_to_p8_64bpp (p64, alpha) & 0xffffffffffffff00ULL) | alpha; + out [0] = (p64 >> 16) & 0x000000ff000000ff; + out [1] = p64 & 0x000000ff000000ff; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p8_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} 
SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_pl_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p8l_128bpp (out, alpha); +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL8, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_pl_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + premul_u_to_p16_128bpp (out, alpha); + out [1] |= (((uint16_t) alpha) << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, COMPRESSED) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +static SMOL_INLINE void +unpack_pixel_123a_u_to_123a_p16l_128bpp (uint32_t p, + uint64_t *out) +{ + uint64_t p64 = p; + uint8_t alpha = p; + + out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); + out [1] = ((p64 & 0x0000ff00) << 24); + + from_srgb_pixel_xxxa_128bpp (out); + premul_ul_to_p16l_128bpp (out, alpha); + + out [1] = (out [1] & 0xffffffff00000000ULL) | ((uint16_t) alpha << 8) | alpha; +} + +SMOL_REPACK_ROW_DEF (1234, 32, 32, UNASSOCIATED, COMPRESSED, + 1234, 128, 64, PREMUL16, LINEAR) { + while (dest_row != dest_row_max) + { + unpack_pixel_123a_u_to_123a_p16l_128bpp (*(src_row++), dest_row); + dest_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +/* ---------------------- * + * Repacking: 64 -> 24/32 * + * ---------------------- */ + +static SMOL_INLINE uint32_t +pack_pixel_1234_p8_to_1324_p8_64bpp (uint64_t in) +{ + return in | (in >> 24); +} + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 132, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 24; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 132, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 24; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 231, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 24; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 231, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 24; + 
src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 324, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + *(dest_row++) = p; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 324, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row >> 24; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p >> 16; + *(dest_row++) = p >> 8; + *(dest_row++) = p; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 423, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + *(dest_row++) = p; + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 423, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row >> 24; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + uint32_t p = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + *(dest_row++) = p; + *(dest_row++) = p >> 8; + *(dest_row++) = p >> 16; + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 1324, 32, 32, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (*(src_row++)); + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, + 1324, 32, 32, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint8_t alpha = *src_row; + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; + *(dest_row++) = pack_pixel_1234_p8_to_1324_p8_64bpp (t); + src_row++; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_64BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_64BPP (*src_row, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 64, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint8_t alpha = *src_row; \ + uint64_t t = (unpremul_p8_to_u_64bpp (*src_row, alpha) & 0xffffffffffffff00ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_64BPP (t, a, b, c, d); \ + src_row++; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (1, 4, 2, 3) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (2, 3, 1, 4) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 1, 3, 2) +DEF_REPACK_FROM_1234_64BPP_TO_32BPP (4, 2, 3, 1) + +/* ----------------------- * + * Repacking: 128 -> 24/32 * + * ----------------------- */ + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = *src_row >> 32; + *(dest_row++) = *(src_row++); + *(dest_row++) = *(src_row++) >> 32; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 123, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + 
uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (src_row, t); + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } \ +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, + 123, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [0] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [1] >> 32; + src_row += 2; + } \ +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + *(dest_row++) = src_row [1] >> 32; + *(dest_row++) = src_row [0]; + *(dest_row++) = src_row [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 321, 24, 8, PREMUL8, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1]; + unpremul_p8_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + 
t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, + 321, 24, 8, UNASSOCIATED, COMPRESSED) { + while (dest_row != dest_row_max) + { + uint64_t t [2]; + uint8_t alpha = src_row [1] >> 8; + unpremul_p16_to_u_128bpp (src_row, t, alpha); + to_srgb_pixel_xxxa_128bpp (t, t); + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; + *(dest_row++) = t [1] >> 32; + *(dest_row++) = t [0]; + *(dest_row++) = t [0] >> 32; + src_row += 2; + } +} SMOL_REPACK_ROW_DEF_END + +#define DEF_REPACK_FROM_1234_128BPP_TO_32BPP(a, b, c, d) \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + *(dest_row++) = PACK_FROM_1234_128BPP (src_row, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, \ + a##b##c##d, 32, 32, PREMUL8, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); \ + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); \ + to_srgb_pixel_xxxa_128bpp (t, t); \ + premul_u_to_p8_128bpp (t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1]; \ + unpremul_p8_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL8, LINEAR, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = get_alpha_from_linear_xxxa_128bpp (src_row); \ + unpremul_p8l_to_ul_128bpp (src_row, t, alpha); \ + to_srgb_pixel_xxxa_128bpp (t, t); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, COMPRESSED, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16_to_u_128bpp (src_row, t, alpha); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END \ + SMOL_REPACK_ROW_DEF (1234, 128, 64, PREMUL16, LINEAR, \ + a##b##c##d, 32, 32, UNASSOCIATED, COMPRESSED) { \ + while (dest_row != dest_row_max) \ + { \ + uint64_t t [2]; \ + uint8_t alpha = src_row [1] >> 8; \ + unpremul_p16l_to_ul_128bpp (src_row, t, alpha); \ + 
to_srgb_pixel_xxxa_128bpp (t, t); \ + t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ + *(dest_row++) = PACK_FROM_1234_128BPP (t, a, b, c, d); \ + src_row += 2; \ + } \ + } SMOL_REPACK_ROW_DEF_END + +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (1, 2, 3, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (3, 2, 1, 4) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 1, 2, 3) +DEF_REPACK_FROM_1234_128BPP_TO_32BPP (4, 3, 2, 1) + +/* -------------- * + * Filter helpers * + * -------------- */ + +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) +{ + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; +} + +static SMOL_INLINE uint64_t +weight_pixel_64bpp (uint64_t p, + uint16_t w) +{ + return ((p * w) >> 8) & 0x00ff00ff00ff00ffULL; +} + +/* p and out may be the same address */ +static SMOL_INLINE void +weight_pixel_128bpp (const uint64_t *p, + uint64_t *out, + uint16_t w) +{ + out [0] = ((p [0] * w) >> 8) & 0x00ffffff00ffffffULL; + out [1] = ((p [1] * w) >> 8) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +sum_parts_64bpp (const uint64_t ** SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT accum, + uint32_t n) +{ + const uint64_t * SMOL_RESTRICT pp = *parts_in; + const uint64_t *pp_end; + + SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t)); + + for (pp_end = pp + n; pp < pp_end; pp++) + { + *accum += *pp; + } + + *parts_in = pp; +} + +static SMOL_INLINE void +sum_parts_128bpp (const uint64_t ** SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT accum, + uint32_t n) +{ + const uint64_t * SMOL_RESTRICT pp = *parts_in; + const uint64_t *pp_end; + + SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t) * 2); + + for (pp_end = pp + n * 2; pp < pp_end; ) + { + accum [0] += *(pp++); + accum [1] += *(pp++); + } + + *parts_in = pp; +} + +static SMOL_INLINE uint64_t +scale_64bpp (uint64_t accum, + uint64_t multiplier) +{ + uint64_t a, b; + + a = ((accum & 0x0000ffff0000ffffULL) * multiplier + + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; + b = (((accum & 0xffff0000ffff0000ULL) >> 16) * multiplier + + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; + + return (a & 0x000000ff000000ffULL) | ((b & 0x000000ff000000ffULL) << 16); +} + +static SMOL_INLINE uint64_t +scale_128bpp_half (uint64_t accum, + uint64_t multiplier) +{ + uint64_t a, b; + + a = accum & 0x00000000ffffffffULL; + a = (a * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; + + b = (accum & 0xffffffff00000000ULL) >> 32; + b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; + + return a | (b << 32); +} + +static SMOL_INLINE void +scale_and_store_128bpp (const uint64_t * SMOL_RESTRICT accum, + uint64_t multiplier, + uint64_t ** SMOL_RESTRICT dest_row_parts) +{ + *(*dest_row_parts)++ = scale_128bpp_half (accum [0], multiplier); + *(*dest_row_parts)++ = scale_128bpp_half (accum [1], multiplier); +} + +static void +add_parts (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + *(parts_acc_out++) += *(parts_in++); +} + +static void +copy_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t 
*parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) = weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +copy_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + weight_pixel_128bpp (parts_in, parts_acc_out, w); + parts_in += 2; + parts_acc_out += 2; + } +} + +static void +add_weighted_parts_64bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + *(parts_acc_out++) += weight_pixel_64bpp (*(parts_in++), w); + } +} + +static void +add_weighted_parts_128bpp (const uint64_t * SMOL_RESTRICT parts_in, + uint64_t * SMOL_RESTRICT parts_acc_out, + uint32_t n, + uint16_t w) +{ + const uint64_t *parts_in_max = parts_in + n * 2; + + SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); + SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); + + while (parts_in < parts_in_max) + { + uint64_t t [2]; + + weight_pixel_128bpp (parts_in, t, w); + parts_acc_out [0] += t [0]; + parts_acc_out [1] += t [1]; + parts_in += 2; + parts_acc_out += 2; + } +} + +static SMOL_INLINE void +apply_subpixel_opacity_64bpp (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ff00ff00ff00ffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp_half (uint64_t * SMOL_RESTRICT u64_inout, uint16_t opacity) +{ + *u64_inout = ((*u64_inout * opacity) >> SMOL_SUBPIXEL_SHIFT) & 0x00ffffff00ffffffULL; +} + +static SMOL_INLINE void +apply_subpixel_opacity_128bpp (uint64_t *u64_inout, uint16_t opacity) +{ + apply_subpixel_opacity_128bpp_half (u64_inout, opacity); + apply_subpixel_opacity_128bpp_half (u64_inout + 1, opacity); +} + +static void +apply_subpixel_opacity_row_copy_64bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + n_pixels; + + while (u64_out != u64_out_max) + { + *u64_out = *u64_in++; + apply_subpixel_opacity_64bpp (u64_out, opacity); + u64_out++; + } +} + +static void +apply_subpixel_opacity_row_copy_128bpp (uint64_t * SMOL_RESTRICT u64_in, + uint64_t * SMOL_RESTRICT u64_out, + int n_pixels, + uint16_t opacity) +{ + uint64_t *u64_out_max = u64_out + (n_pixels * 2); + + while (u64_out != u64_out_max) + { + u64_out [0] = u64_in [0]; + u64_out [1] = u64_in [1]; + apply_subpixel_opacity_128bpp_half (u64_out, opacity); + apply_subpixel_opacity_128bpp_half (u64_out + 1, opacity); + u64_in += 2; + u64_out += 2; + } +} + +static void +apply_horiz_edge_opacity (const SmolScaleCtx *scale_ctx, + uint64_t *row_parts) +{ + if (scale_ctx->storage_type == SMOL_STORAGE_64BPP) + { + apply_subpixel_opacity_64bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + apply_subpixel_opacity_64bpp (&row_parts [scale_ctx->hdim.placement_size_px - 1], scale_ctx->hdim.last_opacity); + } + else + { + apply_subpixel_opacity_128bpp (&row_parts [0], scale_ctx->hdim.first_opacity); + 
apply_subpixel_opacity_128bpp (&row_parts [(scale_ctx->hdim.placement_size_px - 1) * 2], scale_ctx->hdim.last_opacity); + } +} + +/* ------------------ * + * Horizontal scaling * + * ------------------ */ + +#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ +static void \ +interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT src_row_parts, \ + uint64_t * SMOL_RESTRICT dest_row_parts) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; \ + uint64_t p, q; \ + uint64_t F; \ + int i; \ +\ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); \ +\ + do \ + { \ + uint64_t accum = 0; \ +\ + for (i = 0; i < (1 << (n_halvings)); i++) \ + { \ + uint64_t pixel_ofs = *(precalc_x++); \ + F = *(precalc_x++); \ +\ + p = src_row_parts [pixel_ofs]; \ + q = src_row_parts [pixel_ofs + 1]; \ +\ + accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + } \ + *(dest_row_parts++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ + } \ + while (dest_row_parts != dest_row_parts_max); \ +} \ +\ +static void \ +interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + const uint64_t * SMOL_RESTRICT src_row_parts, \ + uint64_t * SMOL_RESTRICT dest_row_parts) \ +{ \ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; \ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; \ + uint64_t p, q; \ + uint64_t F; \ + int i; \ +\ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); \ +\ + do \ + { \ + uint64_t accum [2] = { 0 }; \ + \ + for (i = 0; i < (1 << (n_halvings)); i++) \ + { \ + uint32_t pixel_ofs = *(precalc_x++) * 2; \ + F = *(precalc_x++); \ +\ + p = src_row_parts [pixel_ofs]; \ + q = src_row_parts [pixel_ofs + 2]; \ +\ + accum [0] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ +\ + p = src_row_parts [pixel_ofs + 1]; \ + q = src_row_parts [pixel_ofs + 3]; \ +\ + accum [1] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + } \ + *(dest_row_parts++) = ((accum [0]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ + *(dest_row_parts++) = ((accum [1]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ + } \ + while (dest_row_parts != dest_row_parts_max); \ +} + +static void +interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t p, q; + uint64_t F; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + do + { + uint32_t pixel_ofs = *(precalc_x++); + F = *(precalc_x++); + + p = src_row_parts [pixel_ofs]; + q = src_row_parts [pixel_ofs + 1]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (dest_row_parts != dest_row_parts_max); +} + +static void +interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint16_t * SMOL_RESTRICT precalc_x = scale_ctx->hdim.precalc; + uint64_t * SMOL_RESTRICT dest_row_parts_max = dest_row_parts + 
scale_ctx->hdim.placement_size_px * 2; + uint64_t p, q; + uint64_t F; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + do + { + uint32_t pixel_ofs = *(precalc_x++) * 2; + F = *(precalc_x++); + + p = src_row_parts [pixel_ofs]; + q = src_row_parts [pixel_ofs + 2]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + + p = src_row_parts [pixel_ofs + 1]; + q = src_row_parts [pixel_ofs + 3]; + + *(dest_row_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (dest_row_parts != dest_row_parts_max); +} + +DEF_INTERP_HORIZONTAL_BILINEAR(1) +DEF_INTERP_HORIZONTAL_BILINEAR(2) +DEF_INTERP_HORIZONTAL_BILINEAR(3) +DEF_INTERP_HORIZONTAL_BILINEAR(4) +DEF_INTERP_HORIZONTAL_BILINEAR(5) +DEF_INTERP_HORIZONTAL_BILINEAR(6) + +static SMOL_INLINE void +unpack_box_precalc (const uint32_t precalc, + uint32_t step, + uint32_t *ofs0, + uint32_t *ofs1, + uint32_t *f0, + uint32_t *f1, + uint32_t *n) +{ + *ofs0 = precalc; + *ofs1 = *ofs0 + step; + *f0 = 256 - (*ofs0 % SMOL_SUBPIXEL_MUL); + *f1 = *ofs1 % SMOL_SUBPIXEL_MUL; + *ofs0 /= SMOL_SUBPIXEL_MUL; + *ofs1 /= SMOL_SUBPIXEL_MUL; + *n = *ofs1 - *ofs0 - 1; +} + +static void +interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint64_t * SMOL_RESTRICT pp; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t accum; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts < dest_row_parts_max) + { + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); + + pp = src_row_parts + ofs0; + + accum = weight_pixel_64bpp (*(pp++), f0); + sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); + accum += weight_pixel_64bpp (*pp, f1); + + *(dest_row_parts++) = scale_64bpp (accum, scale_ctx->hdim.span_mul); + } +} + +static void +interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t *src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + const uint64_t * SMOL_RESTRICT pp; + const uint32_t *precalc_x = scale_ctx->hdim.precalc; + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + uint64_t accum [2]; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts < dest_row_parts_max) + { + uint32_t ofs0, ofs1; + uint32_t f0, f1; + uint32_t n; + uint64_t t [2]; + + unpack_box_precalc (*(precalc_x++), + scale_ctx->hdim.span_step, + &ofs0, + &ofs1, + &f0, + &f1, + &n); + + pp = src_row_parts + (ofs0 * 2); + + weight_pixel_128bpp (pp, accum, f0); + pp += 2; + + sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); + + weight_pixel_128bpp (pp, t, f1); + accum [0] += t [0]; + accum [1] += t [1]; + + scale_and_store_128bpp (accum, + scale_ctx->hdim.span_mul, + (uint64_t ** SMOL_RESTRICT) &dest_row_parts); + } +} + +static void +interp_horizontal_one_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px; + uint64_t part; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t 
*); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + part = *src_row_parts; + while (dest_row_parts != dest_row_parts_max) + *(dest_row_parts++) = part; +} + +static void +interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + uint64_t *dest_row_parts_max = dest_row_parts + scale_ctx->hdim.placement_size_px * 2; + + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + while (dest_row_parts != dest_row_parts_max) + { + *(dest_row_parts++) = src_row_parts [0]; + *(dest_row_parts++) = src_row_parts [1]; + } +} + +static void +interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + memcpy (dest_row_parts, src_row_parts, scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); +} + +static void +interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, + const uint64_t * SMOL_RESTRICT src_row_parts, + uint64_t * SMOL_RESTRICT dest_row_parts) +{ + SMOL_ASSUME_ALIGNED (src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_row_parts, uint64_t *); + + memcpy (dest_row_parts, src_row_parts, scale_ctx->hdim.placement_size_px * 2 * sizeof (uint64_t)); +} + +static void +scale_horizontal (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + const char *src_row, + uint64_t *dest_row_parts) +{ + uint64_t * SMOL_RESTRICT src_row_unpacked; + + src_row_unpacked = local_ctx->parts_row [3]; + + /* 32-bit unpackers need 32-bit alignment */ + if ((((uintptr_t) src_row) & 3) + && scale_ctx->src_pixel_type != SMOL_PIXEL_RGB8 + && scale_ctx->src_pixel_type != SMOL_PIXEL_BGR8) + { + if (!local_ctx->src_aligned) + local_ctx->src_aligned = + smol_alloc_aligned (scale_ctx->hdim.src_size_px * sizeof (uint32_t), + &local_ctx->src_aligned_storage); + memcpy (local_ctx->src_aligned, src_row, scale_ctx->hdim.src_size_px * sizeof (uint32_t)); + src_row = (const char *) local_ctx->src_aligned; + } + + scale_ctx->src_unpack_row_func (src_row, + src_row_unpacked, + scale_ctx->hdim.src_size_px); + scale_ctx->hfilter_func (scale_ctx, + src_row_unpacked, + dest_row_parts); + + apply_horiz_edge_opacity (scale_ctx, dest_row_parts); +} + +/* ---------------- * + * Vertical scaling * + * ---------------- */ + +static void +update_local_ctx_bilinear (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t new_src_ofs = precalc_y [dest_row_index * 2]; + + if (new_src_ofs == local_ctx->src_ofs) + return; + + if (new_src_ofs == local_ctx->src_ofs + 1) + { + uint64_t *t = local_ctx->parts_row [0]; + local_ctx->parts_row [0] = local_ctx->parts_row [1]; + local_ctx->parts_row [1] = t; + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); + } + else + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs), + local_ctx->parts_row [0]); + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, new_src_ofs + 1), + local_ctx->parts_row [1]); + } + + local_ctx->src_ofs = new_src_ofs; +} + +static void +interp_vertical_bilinear_store_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * 
SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(dest_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_store_with_opacity_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_64bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT accum_out, + uint32_t width) +{ + uint64_t *accum_dest_last = accum_out + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; + } + while (accum_out != accum_dest_last); +} + +static void +interp_vertical_bilinear_store_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(dest_parts++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_store_with_opacity_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t width, + uint16_t opacity) +{ + uint64_t *parts_dest_last = dest_parts + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *dest_parts = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + apply_subpixel_opacity_128bpp_half (dest_parts, opacity); + dest_parts++; + } + while (dest_parts != parts_dest_last); +} + +static void +interp_vertical_bilinear_add_128bpp (uint64_t F, + const uint64_t * SMOL_RESTRICT top_src_row_parts, + const 
uint64_t * SMOL_RESTRICT bottom_src_row_parts, + uint64_t * SMOL_RESTRICT accum_out, + uint32_t width) +{ + uint64_t *accum_dest_last = accum_out + width; + + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); + SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); + + do + { + uint64_t p, q; + + p = *(top_src_row_parts++); + q = *(bottom_src_row_parts++); + + *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; + } + while (accum_out != accum_dest_last); +} + +#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ +\ + apply_subpixel_opacity_64bpp (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} \ +\ +static void \ +interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (uint64_t F, \ + const uint64_t * SMOL_RESTRICT top_src_row_parts, \ + const uint64_t * SMOL_RESTRICT bottom_src_row_parts, \ + uint64_t * SMOL_RESTRICT accum_inout, \ + uint32_t width, \ + uint16_t opacity) \ +{ \ + uint64_t *accum_inout_last = accum_inout + width; \ +\ + SMOL_ASSUME_ALIGNED (top_src_row_parts, 
const uint64_t *); \ + SMOL_ASSUME_ALIGNED (bottom_src_row_parts, const uint64_t *); \ + SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ +\ + do \ + { \ + uint64_t p, q; \ +\ + p = *(top_src_row_parts++); \ + q = *(bottom_src_row_parts++); \ +\ + p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ + p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ +\ + apply_subpixel_opacity_128bpp_half (&p, opacity); \ + *(accum_inout++) = p; \ + } \ + while (accum_inout != accum_inout_last); \ +} + +#define DEF_SCALE_DEST_ROW_BILINEAR(n_halvings) \ +static int \ +scale_dest_row_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_64bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px); \ +\ + return 2; \ +} \ +\ +static int \ +scale_dest_row_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ + SmolLocalCtx *local_ctx, \ + uint32_t dest_row_index) \ +{ \ + uint16_t *precalc_y = scale_ctx->vdim.precalc; \ + uint32_t bilin_index = dest_row_index << (n_halvings); \ + unsigned int i; \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ +\ + for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ + { \ + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); \ + interp_vertical_bilinear_add_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ + bilin_index++; \ + } \ +\ + update_local_ctx_bilinear (scale_ctx, local_ctx, 
bilin_index); \ +\ + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.first_opacity); \ + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) \ + interp_vertical_bilinear_final_##n_halvings##h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2, \ + scale_ctx->vdim.last_opacity); \ + else \ + interp_vertical_bilinear_final_##n_halvings##h_128bpp (precalc_y [bilin_index * 2 + 1], \ + local_ctx->parts_row [0], \ + local_ctx->parts_row [1], \ + local_ctx->parts_row [2], \ + scale_ctx->hdim.placement_size_px * 2); \ +\ + return 2; \ +} + +static int +scale_dest_row_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_64bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + + update_local_ctx_bilinear (scale_ctx, local_ctx, dest_row_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_store_with_opacity_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_store_128bpp (precalc_y [dest_row_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; +} + +DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) + +static int +scale_dest_row_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = 
dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + bilin_index++; + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_64bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px); + + return 2; +} + +static int +scale_dest_row_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint16_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t bilin_index = dest_row_index << 1; + + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + interp_vertical_bilinear_store_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + bilin_index++; + update_local_ctx_bilinear (scale_ctx, local_ctx, bilin_index); + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.first_opacity); + else if (dest_row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + interp_vertical_bilinear_final_1h_with_opacity_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2, + scale_ctx->vdim.last_opacity); + else + interp_vertical_bilinear_final_1h_128bpp (precalc_y [bilin_index * 2 + 1], + local_ctx->parts_row [0], + local_ctx->parts_row [1], + local_ctx->parts_row [2], + scale_ctx->hdim.placement_size_px * 2); + + return 2; +} + +DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) +DEF_SCALE_DEST_ROW_BILINEAR(2) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) +DEF_SCALE_DEST_ROW_BILINEAR(3) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) +DEF_SCALE_DEST_ROW_BILINEAR(4) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) +DEF_SCALE_DEST_ROW_BILINEAR(5) +DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) +DEF_SCALE_DEST_ROW_BILINEAR(6) + +static void +finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n) +{ + uint64_t *parts_dest_max = dest_parts + n; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *(dest_parts++) = scale_64bpp (*(accums++), multiplier); + } +} + +static void 
+finalize_vertical_with_opacity_64bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) +{ + uint64_t *parts_dest_max = dest_parts + n; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *dest_parts = scale_64bpp (*(accums++), multiplier); + apply_subpixel_opacity_64bpp (dest_parts, opacity); + dest_parts++; + } +} + +static int +scale_dest_row_box_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; + + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); + + /* First input row */ + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; + + /* Add up whole input rows */ + + for (i = 0; i < n; i++) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px); + + ofs_y++; + } + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + /* Finalize */ + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_64bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); + } + + return 0; +} + +static void +finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n) +{ + uint64_t *parts_dest_max = dest_parts + n * 2; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + *(dest_parts++) = scale_128bpp_half (*(accums++), multiplier); + } +} + +static void +finalize_vertical_with_opacity_128bpp (const uint64_t * SMOL_RESTRICT accums, + uint64_t multiplier, + uint64_t * SMOL_RESTRICT dest_parts, + uint32_t n, + uint16_t opacity) +{ + uint64_t *parts_dest_max = dest_parts + n * 2; + + SMOL_ASSUME_ALIGNED (accums, const uint64_t *); + SMOL_ASSUME_ALIGNED (dest_parts, uint64_t *); + + while (dest_parts != parts_dest_max) + { + dest_parts [0] = scale_128bpp_half (*(accums++), multiplier); + dest_parts [1] = scale_128bpp_half (*(accums++), 
multiplier); + apply_subpixel_opacity_128bpp (dest_parts, opacity); + dest_parts += 2; + } +} + +static int +scale_dest_row_box_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index) +{ + uint32_t *precalc_y = scale_ctx->vdim.precalc; + uint32_t ofs_y, ofs_y_max; + uint32_t w1, w2; + uint32_t n, i; + + unpack_box_precalc (precalc_y [dest_row_index], + scale_ctx->vdim.span_step, + &ofs_y, + &ofs_y_max, + &w1, + &w2, + &n); + + /* First input row */ + + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + copy_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w1); + ofs_y++; + + /* Add up whole input rows */ + + for (i = 0; i < n; i++) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_parts (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px * 2); + + ofs_y++; + } + + /* Last input row */ + + if (ofs_y < scale_ctx->vdim.src_size_px) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, ofs_y), + local_ctx->parts_row [0]); + add_weighted_parts_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + w2); + } + + if (dest_row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (dest_row_index == scale_ctx->vdim.placement_size_px - 1 && scale_ctx->vdim.last_opacity < 256) + { + finalize_vertical_with_opacity_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + finalize_vertical_128bpp (local_ctx->parts_row [1], + scale_ctx->vdim.span_mul, + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px); + } + + return 0; +} + +static int +scale_dest_row_one_64bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + /* Scale the row and store it */ + + if (local_ctx->src_ofs != 0) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_64bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t)); + } + + return 1; +} + +static int +scale_dest_row_one_128bpp (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + /* Scale the row and store it */ + + if (local_ctx->src_ofs != 0) + { + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, 0), + local_ctx->parts_row [0]); + local_ctx->src_ofs = 0; + } + + if (row_index == 0 && scale_ctx->vdim.first_opacity < 256) + { 
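+ /* Top output row is only partially covered (subpixel placement); copy it with first_opacity applied */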
+ apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.first_opacity); + } + else if (row_index == (scale_ctx->vdim.placement_size_px - 1) && scale_ctx->vdim.last_opacity < 256) + { + apply_subpixel_opacity_row_copy_128bpp (local_ctx->parts_row [0], + local_ctx->parts_row [1], + scale_ctx->hdim.placement_size_px, + scale_ctx->vdim.last_opacity); + } + else + { + memcpy (local_ctx->parts_row [1], + local_ctx->parts_row [0], + scale_ctx->hdim.placement_size_px * sizeof (uint64_t) * 2); + } + + return 1; +} + +static int +scale_dest_row_copy (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t row_index) +{ + scale_horizontal (scale_ctx, + local_ctx, + src_row_ofs_to_pointer (scale_ctx, row_index), + local_ctx->parts_row [0]); + + return 0; +} + +/* ----------- * + * Compositing * + * ----------- */ + +static void +composite_over_color_64bpp (uint64_t * SMOL_RESTRICT srcdest_row, + const uint64_t * SMOL_RESTRICT color_pixel, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (srcdest_row, uint64_t *, sizeof (uint64_t)); + SMOL_ASSUME_ALIGNED_TO (color_pixel, const uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels; i++) + { + uint64_t a = srcdest_row [i] & 0xff; + + srcdest_row [i] += (((*color_pixel) * (0xff - a)) >> 8) & 0x00ff00ff00ff00ff; + } +} + +static void +composite_over_color_128bpp (uint64_t * SMOL_RESTRICT srcdest_row, + const uint64_t * SMOL_RESTRICT color_pixel, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (srcdest_row, uint64_t *, sizeof (uint64_t) * 2); + SMOL_ASSUME_ALIGNED_TO (color_pixel, const uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels * 2; i += 2) + { + uint64_t a = (srcdest_row [i + 1] >> 4) & 0xfff; + + srcdest_row [i] += ((color_pixel [0] * (0xfff - a)) >> 12) & 0x000fffff000fffff; + srcdest_row [i + 1] += ((color_pixel [1] * (0xfff - a)) >> 12) & 0x000fffff000fffff; + } +} + +static void +composite_over_dest_64bpp (const uint64_t * SMOL_RESTRICT src_row, + uint64_t * SMOL_RESTRICT dest_row, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_row, const uint64_t *, sizeof (uint64_t)); + SMOL_ASSUME_ALIGNED_TO (dest_row, uint64_t *, sizeof (uint64_t)); + + for (i = 0; i < n_pixels; i++) + { + dest_row [i] = ((src_row [i] + dest_row [i]) >> 1) & 0x7fff7fff7fff7fff; + } +} + +static void +composite_over_dest_128bpp (const uint64_t * SMOL_RESTRICT src_row, + uint64_t * SMOL_RESTRICT dest_row, + uint32_t n_pixels) +{ + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_row, const uint64_t *, sizeof (uint64_t) * 2); + SMOL_ASSUME_ALIGNED_TO (dest_row, uint64_t *, sizeof (uint64_t) * 2); + + for (i = 0; i < n_pixels * 2; i += 2) + { + dest_row [i] = ((src_row [i] + dest_row [i]) >> 1) & 0x7fffffff7fffffff; + dest_row [i + 1] = ((src_row [i + 1] + dest_row [i + 1]) >> 1) & 0x7fffffff7fffffff; + } +} + +/* -------- * + * Clearing * + * -------- */ + +static void +clear_24bpp (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels) +{ + const uint8_t *src_pixel_batch_u8 = src_pixel_batch; + const uint32_t *src_pixel_batch_u32 = src_pixel_batch; + uint8_t *dest_row_u8 = dest_row; + uint32_t *dest_row_u32 = dest_row; + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_pixel_batch_u32, const uint32_t *, sizeof (uint32_t)); + + for (i = 0; n_pixels - i >= 4; i += 4) + { + *(dest_row_u32++) = src_pixel_batch_u32 [0]; + *(dest_row_u32++) = src_pixel_batch_u32 [1]; + *(dest_row_u32++) = 
src_pixel_batch_u32 [2]; + } + + for ( ; i < n_pixels; i++) + { + dest_row_u8 [i * 3] = src_pixel_batch_u8 [0]; + dest_row_u8 [i * 3 + 1] = src_pixel_batch_u8 [1]; + dest_row_u8 [i * 3 + 2] = src_pixel_batch_u8 [2]; + } +} + +static void +clear_32bpp (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels) +{ + const uint32_t *src_pixel_batch_u32 = src_pixel_batch; + uint32_t *dest_row_u32 = dest_row; + uint32_t i; + + SMOL_ASSUME_ALIGNED_TO (src_pixel_batch_u32, const uint32_t *, sizeof (uint32_t)); + + for (i = 0; i < n_pixels; i++) + dest_row_u32 [i] = src_pixel_batch_u32 [0]; +} + +/* --------------- * + * Function tables * + * --------------- */ + +#define R SMOL_REPACK_META + +static const SmolRepackMeta repack_meta [] = +{ + R (123, 24, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (123, 24, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + + R (1234, 32, PREMUL8, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1324, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2431, 64, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 3241, 64, PREMUL8, COMPRESSED), + + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, COMPRESSED), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, COMPRESSED), + R (1234, 32, PREMUL8, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + R (1234, 32, PREMUL8, COMPRESSED, 2341, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL8, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 1234, 128, PREMUL16, LINEAR), + R (1234, 32, UNASSOCIATED, COMPRESSED, 2341, 128, PREMUL16, LINEAR), + + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 132, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 231, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 324, 24, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 423, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, PREMUL8, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1324, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 1423, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 2314, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4132, 32, UNASSOCIATED, COMPRESSED), + R (1234, 64, PREMUL8, COMPRESSED, 4231, 32, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, PREMUL8, 
COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 123, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 321, 24, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 321, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 123, 24, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 321, 24, UNASSOCIATED, COMPRESSED), + + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, COMPRESSED, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 1234, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 3214, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4123, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4321, 32, PREMUL8, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL8, LINEAR, 4321, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 1234, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 3214, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 4123, 32, UNASSOCIATED, COMPRESSED), + R (1234, 128, PREMUL16, LINEAR, 4321, 32, UNASSOCIATED, COMPRESSED), + + SMOL_REPACK_META_LAST +}; + +#undef R + +static const SmolImplementation implementation = +{ + /* Horizontal init */ + init_horizontal, + + /* Vertical init */ + init_vertical, + + { + /* Horizontal filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, + { + /* 64bpp */ + interp_horizontal_copy_64bpp, + interp_horizontal_one_64bpp, + interp_horizontal_bilinear_0h_64bpp, + interp_horizontal_bilinear_1h_64bpp, + interp_horizontal_bilinear_2h_64bpp, + interp_horizontal_bilinear_3h_64bpp, + interp_horizontal_bilinear_4h_64bpp, + interp_horizontal_bilinear_5h_64bpp, + interp_horizontal_bilinear_6h_64bpp, + interp_horizontal_boxes_64bpp + }, + { + /* 128bpp */ + interp_horizontal_copy_128bpp, + interp_horizontal_one_128bpp, + interp_horizontal_bilinear_0h_128bpp, + interp_horizontal_bilinear_1h_128bpp, + interp_horizontal_bilinear_2h_128bpp, + interp_horizontal_bilinear_3h_128bpp, + interp_horizontal_bilinear_4h_128bpp, + interp_horizontal_bilinear_5h_128bpp, + interp_horizontal_bilinear_6h_128bpp, + interp_horizontal_boxes_128bpp + } + }, + { + /* Vertical 
filters */ + { + /* 24bpp */ + }, + { + /* 32bpp */ + }, + { + /* 64bpp */ + scale_dest_row_copy, + scale_dest_row_one_64bpp, + scale_dest_row_bilinear_0h_64bpp, + scale_dest_row_bilinear_1h_64bpp, + scale_dest_row_bilinear_2h_64bpp, + scale_dest_row_bilinear_3h_64bpp, + scale_dest_row_bilinear_4h_64bpp, + scale_dest_row_bilinear_5h_64bpp, + scale_dest_row_bilinear_6h_64bpp, + scale_dest_row_box_64bpp + }, + { + /* 128bpp */ + scale_dest_row_copy, + scale_dest_row_one_128bpp, + scale_dest_row_bilinear_0h_128bpp, + scale_dest_row_bilinear_1h_128bpp, + scale_dest_row_bilinear_2h_128bpp, + scale_dest_row_bilinear_3h_128bpp, + scale_dest_row_bilinear_4h_128bpp, + scale_dest_row_bilinear_5h_128bpp, + scale_dest_row_bilinear_6h_128bpp, + scale_dest_row_box_128bpp + } + }, + { + /* Composite over color */ + NULL, + NULL, + composite_over_color_64bpp, + composite_over_color_128bpp + }, + { + /* Composite over dest */ + NULL, + NULL, + composite_over_dest_64bpp, + composite_over_dest_128bpp + }, + { + /* Clear dest */ + clear_24bpp, + clear_32bpp, + NULL, + NULL + }, + repack_meta +}; + +const SmolImplementation * +_smol_get_generic_implementation (void) +{ + return &implementation; +} diff --git a/chafa/internal/smolscale/smolscale-private.h b/chafa/internal/smolscale/smolscale-private.h index 4ad3429f..bb305841 100644 --- a/chafa/internal/smolscale/smolscale-private.h +++ b/chafa/internal/smolscale/smolscale-private.h @@ -2,6 +2,15 @@ /* Copyright © 2019-2023 Hans Petter Jansson. See COPYING for details. */ +/* If you're just going to use Smolscale in your project, you don't have to + * worry about anything in here. The public API and documentation, such as + * it is, lives in smolscale.h. + * + * If, on the other hand, you're here to hack on Smolscale itself, this file + * contains all the internal shared declarations. */ + +#undef SMOL_ENABLE_ASSERTS + #include <stdint.h> #include "smolscale.h" @@ -12,8 +21,16 @@ extern "C" { #endif -#include "config.h" +#ifdef SMOL_ENABLE_ASSERTS +# include <assert.h> +# define SMOL_ASSERT(x) assert (x) +#else +# define SMOL_ASSERT(x) +#endif +/* We'll use at most ~4MB of scratch space. That won't fit on the stack + * everywhere, so we default to malloc(). If you know better, you can define + * SMOL_USE_ALLOCA. */ #ifdef SMOL_USE_ALLOCA # define _SMOL_ALLOC(n) alloca (n) # define _SMOL_FREE(p) @@ -70,7 +87,10 @@ typedef unsigned int SmolBool; #define SMOL_ALIGNMENT 64 -#define SMOL_ASSUME_ALIGNED_TO(x, t, n) (x) = (t) __builtin_assume_aligned ((x), (n)) +#define SMOL_ASSIGN_ALIGNED_TO(x, t, n) (t) __builtin_assume_aligned ((x), (n)) +#define SMOL_ASSIGN_ALIGNED(x, t) SMOL_ASSIGN_ALIGNED_TO ((x), t, SMOL_ALIGNMENT) + +#define SMOL_ASSUME_ALIGNED_TO(x, t, n) (x) = SMOL_ASSIGN_ALIGNED_TO ((x), t, (n)) +#define SMOL_ASSUME_ALIGNED(x, t) SMOL_ASSUME_ALIGNED_TO ((x), t, SMOL_ALIGNMENT) /* Pointer to beginning of storage is stored in *r. This must be passed to smol_free() later.
*/ @@ -81,8 +101,11 @@ typedef unsigned int SmolBool; typedef enum { + SMOL_STORAGE_24BPP, + SMOL_STORAGE_32BPP, SMOL_STORAGE_64BPP, SMOL_STORAGE_128BPP, + SMOL_STORAGE_MAX } SmolStorageType; @@ -104,92 +127,281 @@ typedef enum } SmolFilterType; +typedef enum +{ + SMOL_REORDER_1234_TO_1234, + + SMOL_REORDER_1234_TO_2341, + SMOL_REORDER_1234_TO_3214, + SMOL_REORDER_1234_TO_4123, + SMOL_REORDER_1234_TO_4321, + SMOL_REORDER_1234_TO_123, + SMOL_REORDER_1234_TO_321, + SMOL_REORDER_123_TO_1234, + + SMOL_REORDER_1234_TO_1324, + SMOL_REORDER_1234_TO_2314, + SMOL_REORDER_1234_TO_2431, + SMOL_REORDER_1234_TO_4132, + SMOL_REORDER_1234_TO_4231, + SMOL_REORDER_1234_TO_132, + SMOL_REORDER_1234_TO_231, + SMOL_REORDER_123_TO_1324, + + SMOL_REORDER_1234_TO_324, + SMOL_REORDER_1234_TO_423, + + SMOL_REORDER_1234_TO_1423, + SMOL_REORDER_1234_TO_3241, + + SMOL_REORDER_MAX +} +SmolReorderType; + +typedef enum +{ + SMOL_ALPHA_UNASSOCIATED, + SMOL_ALPHA_PREMUL8, + SMOL_ALPHA_PREMUL16, + + SMOL_ALPHA_MAX +} +SmolAlphaType; + +typedef enum +{ + SMOL_GAMMA_SRGB_COMPRESSED, + SMOL_GAMMA_SRGB_LINEAR, + + SMOL_GAMMA_MAX +} +SmolGammaType; + +typedef struct +{ + unsigned char src [4]; + unsigned char dest [4]; +} +SmolReorderMeta; + +typedef struct +{ + unsigned char storage; + unsigned char pixel_stride; + unsigned char alpha; + unsigned char order [4]; +} +SmolPixelTypeMeta; + /* For reusing rows that have already undergone horizontal scaling */ typedef struct { - uint32_t in_ofs; + uint32_t src_ofs; uint64_t *parts_row [4]; uint64_t *row_storage [4]; - uint32_t *in_aligned; - uint32_t *in_aligned_storage; + uint32_t *src_aligned; + uint32_t *src_aligned_storage; } -SmolVerticalCtx; +SmolLocalCtx; -typedef void (SmolUnpackRowFunc) (const uint32_t *row_in, - uint64_t *row_out, +typedef void (SmolInitFunc) (SmolScaleCtx *scale_ctx); +typedef void (SmolRepackRowFunc) (const void *src_row, + void *dest_row, uint32_t n_pixels); -typedef void (SmolPackRowFunc) (const uint64_t *row_in, - uint32_t *row_out, - uint32_t n_pixels); typedef void (SmolHFilterFunc) (const SmolScaleCtx *scale_ctx, - const uint64_t *row_limbs_in, - uint64_t *row_limbs_out); -typedef void (SmolVFilterFunc) (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out); - -#define SMOL_CONV_UNDEFINED { 0, NULL, NULL } -#define SMOL_CONV(un_from_order, un_from_type, un_to_order, un_to_type, pk_from_order, pk_from_type, pk_to_order, pk_to_type, storage_bits) \ -{ storage_bits / 8, (SmolUnpackRowFunc *) unpack_row_##un_from_order##_##un_from_type##_to_##un_to_order##_##un_to_type##_##storage_bits##bpp, \ -(SmolPackRowFunc *) pack_row_##pk_from_order##_##pk_from_type##_to_##pk_to_order##_##pk_to_type##_##storage_bits##bpp } + const uint64_t *src_row_limbs, + uint64_t *dest_row_limbs); +typedef int (SmolVFilterFunc) (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index); +typedef void (SmolCompositeOverColorFunc) (uint64_t *srcdest_row, + const uint64_t *color_pixel, + uint32_t n_pixels); +typedef void (SmolCompositeOverDestFunc) (const uint64_t *src_row, + uint64_t *dest_row, + uint32_t n_pixels); +typedef void (SmolClearFunc) (const void *src_pixel_batch, + void *dest_row, + uint32_t n_pixels); + +#define SMOL_REPACK_SIGNATURE_GET_REORDER(sig) ((sig) >> (2 * (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS))) + +#define SMOL_REORDER_BITS 6 +#define SMOL_STORAGE_BITS 2 +#define SMOL_ALPHA_BITS 2 +#define SMOL_GAMMA_BITS 1 + +#define 
SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER(src_storage, src_alpha, src_gamma, \ + dest_storage, dest_alpha, dest_gamma) \ + (((src_storage) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS + SMOL_GAMMA_BITS + SMOL_ALPHA_BITS)) \ + | ((src_alpha) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS + SMOL_GAMMA_BITS)) \ + | ((src_gamma) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS + SMOL_STORAGE_BITS)) \ + | ((dest_storage) << (SMOL_GAMMA_BITS + SMOL_ALPHA_BITS)) \ + | ((dest_alpha) << (SMOL_GAMMA_BITS)) \ + | ((dest_gamma) << 0)) \ + +#define MASK_ITEM(m, n_bits) ((m) ? (1 << (n_bits)) - 1 : 0) + +#define SMOL_REPACK_SIGNATURE_ANY_ORDER_MASK(src_storage, src_alpha, src_gamma, \ + dest_storage, dest_alpha, dest_gamma) \ + SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER(MASK_ITEM (src_storage, SMOL_STORAGE_BITS), \ + MASK_ITEM (src_alpha, SMOL_ALPHA_BITS), \ + MASK_ITEM (src_gamma, SMOL_GAMMA_BITS), \ + MASK_ITEM (dest_storage, SMOL_STORAGE_BITS), \ + MASK_ITEM (dest_alpha, SMOL_ALPHA_BITS), \ + MASK_ITEM (dest_gamma, SMOL_GAMMA_BITS)) + +#define SMOL_REPACK_META(src_order, src_storage, src_alpha, src_gamma, \ + dest_order, dest_storage, dest_alpha, dest_gamma) \ + { (((SMOL_REORDER_##src_order##_TO_##dest_order) << 10) \ + | ((SMOL_STORAGE_##src_storage##BPP) << 8) | ((SMOL_ALPHA_##src_alpha) << 6) \ + | ((SMOL_GAMMA_SRGB_##src_gamma) << 5) \ + | ((SMOL_STORAGE_##dest_storage##BPP) << 3) | ((SMOL_ALPHA_##dest_alpha) << 1) \ + | ((SMOL_GAMMA_SRGB_##dest_gamma) << 0)), \ + (SmolRepackRowFunc *) repack_row_##src_order##_##src_storage##_##src_alpha##_##src_gamma##_to_##dest_order##_##dest_storage##_##dest_alpha##_##dest_gamma } + +#define SMOL_REPACK_META_LAST { 0xffff, NULL } typedef struct { - uint8_t n_bytes_per_pixel; - SmolUnpackRowFunc *unpack_row_func; - SmolPackRowFunc *pack_row_func; + uint16_t signature; + SmolRepackRowFunc *repack_row_func; } -SmolConversion; +SmolRepackMeta; + +#define SMOL_REPACK_ROW_DEF(src_order, src_storage, src_limb_bits, src_alpha, src_gamma, \ + dest_order, dest_storage, dest_limb_bits, dest_alpha, dest_gamma) \ + static void repack_row_##src_order##_##src_storage##_##src_alpha##_##src_gamma##_to_##dest_order##_##dest_storage##_##dest_alpha##_##dest_gamma \ + (const uint##src_limb_bits##_t * SMOL_RESTRICT src_row, \ + uint##dest_limb_bits##_t * SMOL_RESTRICT dest_row, \ + uint32_t n_pixels) \ + { \ + uint##dest_limb_bits##_t *dest_row_max = dest_row + n_pixels * (dest_storage / dest_limb_bits); \ + SMOL_ASSUME_ALIGNED_TO (src_row, uint##src_limb_bits##_t *, src_limb_bits / 8); \ + SMOL_ASSUME_ALIGNED_TO (dest_row, uint##dest_limb_bits##_t *, dest_limb_bits / 8); + +#define SMOL_REPACK_ROW_DEF_END } typedef struct { - SmolConversion conversions [SMOL_STORAGE_MAX] [SMOL_PIXEL_MAX] [SMOL_PIXEL_MAX]; + SmolInitFunc *init_h_func; + SmolInitFunc *init_v_func; + SmolHFilterFunc *hfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + SmolVFilterFunc *vfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + SmolCompositeOverColorFunc *composite_over_color_funcs [SMOL_STORAGE_MAX]; + SmolCompositeOverDestFunc *composite_over_dest_funcs [SMOL_STORAGE_MAX]; + SmolClearFunc *clear_funcs [SMOL_STORAGE_MAX]; + const SmolRepackMeta *repack_meta; } -SmolConversionTable; +SmolImplementation; typedef struct { - SmolHFilterFunc *hfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; - SmolVFilterFunc *vfilter_funcs [SMOL_STORAGE_MAX] [SMOL_FILTER_MAX]; + void *precalc; + SmolFilterType filter_type; + + uint32_t src_size_px, src_size_spx; + uint32_t dest_size_px, dest_size_spx; + + unsigned int 
n_halvings; + + int32_t placement_ofs_px, placement_ofs_spx; + uint32_t placement_size_px, placement_size_spx; + uint32_t placement_size_prehalving_px, placement_size_prehalving_spx; + + uint32_t span_step; /* For box filter, in spx */ + uint32_t span_mul; /* For box filter */ + + /* Opacity of first and last column or row. Used for subpixel placement + * and applied after each scaling step. */ + uint16_t first_opacity, last_opacity; + + /* Rows or cols to add consisting of unbroken pixel_color. This is done + * after scaling but before conversion to output pixel format. */ + uint16_t clear_before_px, clear_after_px; - /* Can be a NULL pointer if the implementation does not override any - * conversions. */ - const SmolConversionTable *ctab; + uint16_t clip_before_px, clip_after_px; } -SmolImplementation; +SmolDim; + +#define SMOL_CLEAR_BATCH_SIZE 96 struct SmolScaleCtx { /* */ - const uint32_t *pixels_in; - uint32_t *pixels_out; - uint32_t width_in, height_in, rowstride_in; - uint32_t width_out, height_out, rowstride_out; + const char *src_pixels; + char *dest_pixels; - SmolPixelType pixel_type_in, pixel_type_out; - SmolFilterType filter_h, filter_v; + uint32_t src_rowstride; + uint32_t dest_rowstride; + + SmolPixelType src_pixel_type, dest_pixel_type; SmolStorageType storage_type; + SmolGammaType gamma_type; + SmolCompositeOp composite_op; + + /* Raw flags passed in by user */ + SmolFlags flags; - SmolUnpackRowFunc *unpack_row_func; - SmolPackRowFunc *pack_row_func; + SmolRepackRowFunc *src_unpack_row_func; + SmolRepackRowFunc *dest_unpack_row_func; + SmolRepackRowFunc *pack_row_func; SmolHFilterFunc *hfilter_func; SmolVFilterFunc *vfilter_func; + SmolCompositeOverColorFunc *composite_over_color_func; + SmolCompositeOverDestFunc *composite_over_dest_func; + SmolClearFunc *clear_dest_func; /* User specified, can be NULL */ SmolPostRowFunc *post_row_func; void *user_data; - /* Each offset is split in two uint16s: { pixel index, fraction }. These - * are relative to the image after halvings have taken place. */ - uint16_t *offsets_x, *offsets_y; - uint32_t span_mul_x, span_mul_y; /* For box filter */ + /* Storage for dimensions' precalc arrays. Single allocation. */ + void *precalc_storage; + + /* Specifics for each dimension */ + SmolDim hdim, vdim; + + /* TRUE if input rows can be copied directly to output. */ + unsigned int is_noop : 1; + + /* TRUE if we have a color_pixel to composite on. */ + unsigned int have_composite_color : 1; - uint32_t width_bilin_out, height_bilin_out; - unsigned int width_halvings, height_halvings; + /* Unpacked color to composite on */ + uint64_t color_pixel [2]; + + /* A batch of color pixels in dest storage format. The batch size + * is in bytes, and chosen as an even multiple of 3, allowing 32 bytes wide + * operations (e.g. AVX2) to be used to clear packed RGB pixels. */ + unsigned char color_pixels_clear_batch [SMOL_CLEAR_BATCH_SIZE]; }; +/* Number of pixels to convert per batch. For some conversions, we perform + * an alpha test per batch to avoid the expensive premul path when the image + * data is opaque. + * + * FIXME: Unimplemented. 
*/ +#define PIXEL_BATCH_SIZE 32 + +#define SRGB_LINEAR_BITS 11 +#define SRGB_LINEAR_MAX (1 << (SRGB_LINEAR_BITS)) + +extern const uint16_t _smol_from_srgb_lut [256]; +extern const uint8_t _smol_to_srgb_lut [SRGB_LINEAR_MAX]; + +#define INVERTED_DIV_SHIFT_P8 (21 - 8) +#define INVERTED_DIV_SHIFT_P8L (22 - SRGB_LINEAR_BITS) +#define INVERTED_DIV_SHIFT_P16 (24 - 8) +#define INVERTED_DIV_SHIFT_P16L (30 - SRGB_LINEAR_BITS) + +extern const uint32_t _smol_inv_div_p8_lut [256]; +extern const uint32_t _smol_inv_div_p8l_lut [256]; +extern const uint32_t _smol_inv_div_p16_lut [256]; +extern const uint32_t _smol_inv_div_p16l_lut [256]; + +const SmolImplementation *_smol_get_generic_implementation (void); #ifdef SMOL_WITH_AVX2 const SmolImplementation *_smol_get_avx2_implementation (void); #endif diff --git a/chafa/internal/smolscale/smolscale.c b/chafa/internal/smolscale/smolscale.c index 2f983219..c14bff7c 100644 --- a/chafa/internal/smolscale/smolscale.c +++ b/chafa/internal/smolscale/smolscale.c @@ -8,3023 +8,1337 @@ #include #include "smolscale-private.h" -/* --- Premultiplication --- */ +/* ----------------------- * + * Misc. conversion tables * + * ----------------------- */ -#define INVERTED_DIV_SHIFT 21 -#define INVERTED_DIV_ROUNDING (1U << (INVERTED_DIV_SHIFT - 1)) -#define INVERTED_DIV_ROUNDING_128BPP \ - (((uint64_t) INVERTED_DIV_ROUNDING << 32) | INVERTED_DIV_ROUNDING) - -/* This table is used to divide by an integer [1..255] using only a lookup, - * multiplication and a shift. This is faster than plain division on most - * architectures. +/* Table of channel reorderings. Each entry describes an available shuffle + * implementation indexed by its SmolReorderType. Channel indexes are 1-based. + * A zero index denotes that the channel is not present (e.g. 3-channel RGB). * - * Each entry represents the integer 2097152 (1 << 21) divided by the index - * of the entry. Consequently, + * Keep in sync with the private SmolReorderType enum. */ +static const SmolReorderMeta reorder_meta [SMOL_REORDER_MAX] = +{ + { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } }, + + { { 1, 2, 3, 4 }, { 2, 3, 4, 1 } }, + { { 1, 2, 3, 4 }, { 3, 2, 1, 4 } }, + { { 1, 2, 3, 4 }, { 4, 1, 2, 3 } }, + { { 1, 2, 3, 4 }, { 4, 3, 2, 1 } }, + { { 1, 2, 3, 4 }, { 1, 2, 3, 0 } }, + { { 1, 2, 3, 4 }, { 3, 2, 1, 0 } }, + { { 1, 2, 3, 0 }, { 1, 2, 3, 4 } }, + + { { 1, 2, 3, 4 }, { 1, 3, 2, 4 } }, + { { 1, 2, 3, 4 }, { 2, 3, 1, 4 } }, + { { 1, 2, 3, 4 }, { 2, 4, 3, 1 } }, + { { 1, 2, 3, 4 }, { 4, 1, 3, 2 } }, + { { 1, 2, 3, 4 }, { 4, 2, 3, 1 } }, + { { 1, 2, 3, 4 }, { 1, 3, 2, 0 } }, + { { 1, 2, 3, 4 }, { 2, 3, 1, 0 } }, + { { 1, 2, 3, 0 }, { 1, 3, 2, 4 } }, + + { { 1, 2, 3, 4 }, { 3, 2, 4, 0 } }, + { { 1, 2, 3, 4 }, { 4, 2, 3, 0 } }, + + { { 1, 2, 3, 4 }, { 1, 4, 2, 3 } }, + { { 1, 2, 3, 4 }, { 3, 2, 4, 1 } } +}; + +/* Metadata for each pixel type. Storage type, number of channels, alpha type, + * channel ordering. Channel indexes are 1-based, and 4 is always alpha. A + * zero index denotes that the channel is not present. * - * (v / i) ~= (v * inverted_div_table [i] + (1 << 20)) >> 21 + * RGBA = 1, 2, 3, 4. * - * (1 << 20) is added for nearest rounding. It would've been nice to keep - * this table in uint16_t, but alas, we need the extra bits for sufficient - * precision. 
*/ -static const uint32_t inverted_div_table [256] = -{ - 0,2097152,1048576, 699051, 524288, 419430, 349525, 299593, - 262144, 233017, 209715, 190650, 174763, 161319, 149797, 139810, - 131072, 123362, 116508, 110376, 104858, 99864, 95325, 91181, - 87381, 83886, 80660, 77672, 74898, 72316, 69905, 67650, - 65536, 63550, 61681, 59919, 58254, 56680, 55188, 53773, - 52429, 51150, 49932, 48771, 47663, 46603, 45590, 44620, - 43691, 42799, 41943, 41121, 40330, 39569, 38836, 38130, - 37449, 36792, 36158, 35545, 34953, 34380, 33825, 33288, - 32768, 32264, 31775, 31301, 30840, 30394, 29959, 29537, - 29127, 28728, 28340, 27962, 27594, 27236, 26887, 26546, - 26214, 25891, 25575, 25267, 24966, 24672, 24385, 24105, - 23831, 23564, 23302, 23046, 22795, 22550, 22310, 22075, - 21845, 21620, 21400, 21183, 20972, 20764, 20560, 20361, - 20165, 19973, 19784, 19600, 19418, 19240, 19065, 18893, - 18725, 18559, 18396, 18236, 18079, 17924, 17772, 17623, - 17476, 17332, 17190, 17050, 16913, 16777, 16644, 16513, - 16384, 16257, 16132, 16009, 15888, 15768, 15650, 15534, - 15420, 15308, 15197, 15087, 14980, 14873, 14769, 14665, - 14564, 14463, 14364, 14266, 14170, 14075, 13981, 13888, - 13797, 13707, 13618, 13530, 13443, 13358, 13273, 13190, - 13107, 13026, 12945, 12866, 12788, 12710, 12633, 12558, - 12483, 12409, 12336, 12264, 12193, 12122, 12053, 11984, - 11916, 11848, 11782, 11716, 11651, 11586, 11523, 11460, - 11398, 11336, 11275, 11215, 11155, 11096, 11038, 10980, - 10923, 10866, 10810, 10755, 10700, 10645, 10592, 10538, - 10486, 10434, 10382, 10331, 10280, 10230, 10180, 10131, - 10082, 10034, 9986, 9939, 9892, 9846, 9800, 9754, - 9709, 9664, 9620, 9576, 9533, 9489, 9447, 9404, - 9362, 9321, 9279, 9239, 9198, 9158, 9118, 9079, - 9039, 9001, 8962, 8924, 8886, 8849, 8812, 8775, - 8738, 8702, 8666, 8630, 8595, 8560, 8525, 8490, - 8456, 8422, 8389, 8355, 8322, 8289, 8257, 8224, + * Keep in sync with the public SmolPixelType enum. */ +static const SmolPixelTypeMeta pixel_type_meta [SMOL_PIXEL_MAX] = +{ + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 1, 2, 3, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 3, 2, 1, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 4, 1, 2, 3 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_PREMUL8, { 4, 3, 2, 1 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 1, 2, 3, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 3, 2, 1, 4 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 4, 1, 2, 3 } }, + { SMOL_STORAGE_32BPP, 4, SMOL_ALPHA_UNASSOCIATED, { 4, 3, 2, 1 } }, + { SMOL_STORAGE_24BPP, 3, SMOL_ALPHA_PREMUL8, { 1, 2, 3, 0 } }, + { SMOL_STORAGE_24BPP, 3, SMOL_ALPHA_PREMUL8, { 3, 2, 1, 0 } } }; -/* Masking and shifting out the results is left to the caller. In - * and out may not overlap. 
*/ -static SMOL_INLINE void -unpremul_i_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = ((in [0] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); - out [1] = ((in [1] * (uint64_t) inverted_div_table [alpha] - + INVERTED_DIV_ROUNDING_128BPP) >> INVERTED_DIV_SHIFT); -} - -static SMOL_INLINE void -unpremul_p_to_u_128bpp (const uint64_t * SMOL_RESTRICT in, - uint64_t * SMOL_RESTRICT out, - uint8_t alpha) -{ - out [0] = (((in [0] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); - out [1] = (((in [1] << 8) * (uint64_t) inverted_div_table [alpha]) - >> INVERTED_DIV_SHIFT); -} - -static SMOL_INLINE uint64_t -unpremul_p_to_u_64bpp (const uint64_t in, - uint8_t alpha) -{ - uint64_t in_128bpp [2]; - uint64_t out_128bpp [2]; - - in_128bpp [0] = (in & 0x000000ff000000ff); - in_128bpp [1] = (in & 0x00ff000000ff0000) >> 16; - - unpremul_p_to_u_128bpp (in_128bpp, out_128bpp, alpha); - - return (out_128bpp [0] & 0x000000ff000000ff) - | ((out_128bpp [1] & 0x000000ff000000ff) << 16); -} - -static SMOL_INLINE uint64_t -premul_u_to_p_64bpp (const uint64_t in, - uint8_t alpha) -{ - return ((in * ((uint16_t) alpha + 1)) >> 8) & 0x00ff00ff00ff00ff; -} - -/* --- Packing --- */ - -/* It's nice to be able to shift by a negative amount */ -#define SHIFT_S(in, s) ((s >= 0) ? (in) << (s) : (in) >> -(s)) - -#if 0 -/* Currently unused */ - -/* This is kind of bulky (~13 x86 insns), but it's about the same as using - * unions, and we don't have to worry about endianness. */ -#define PACK_FROM_1234_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), ((a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), ((b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), ((c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), ((d) - 1) * 16 + 8 - 56) & 0x000000ff)) -#endif +/* Channel ordering corrected for little endian. Only applies when fetching + * entire pixels as dwords (i.e. u32), so 3-byte variants don't require any + * correction. + * + * Keep in sync with the public SmolPixelType enum. */ +static const SmolPixelType pixel_type_u32_le [SMOL_PIXEL_MAX] = +{ + SMOL_PIXEL_ABGR8_PREMULTIPLIED, + SMOL_PIXEL_ARGB8_PREMULTIPLIED, + SMOL_PIXEL_BGRA8_PREMULTIPLIED, + SMOL_PIXEL_RGBA8_PREMULTIPLIED, + SMOL_PIXEL_ABGR8_UNASSOCIATED, + SMOL_PIXEL_ARGB8_UNASSOCIATED, + SMOL_PIXEL_BGRA8_UNASSOCIATED, + SMOL_PIXEL_RGBA8_UNASSOCIATED, + SMOL_PIXEL_RGB8, + SMOL_PIXEL_BGR8 +}; -#define PACK_FROM_1234_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [((a) - 1) >> 1]), (((a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [((b) - 1) >> 1]), (((b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [((c) - 1) >> 1]), (((c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [((d) - 1) >> 1]), (((d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) - -#define SWAP_2_AND_3(n) ((n) == 2 ? 3 : (n) == 3 ? 
2 : n) - -#define PACK_FROM_1324_64BPP(in, a, b, c, d) \ - ((SHIFT_S ((in), (SWAP_2_AND_3 (a) - 1) * 16 + 8 - 32) & 0xff000000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (b) - 1) * 16 + 8 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (c) - 1) * 16 + 8 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in), (SWAP_2_AND_3 (d) - 1) * 16 + 8 - 56) & 0x000000ff)) - -#if 0 -/* Currently unused */ - -#define PACK_FROM_1324_128BPP(in, a, b, c, d) \ - ((SHIFT_S ((in [(SWAP_2_AND_3 (a) - 1) >> 1]), \ - ((SWAP_2_AND_3 (a) - 1) & 1) * 32 + 24 - 32) & 0xff000000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (b) - 1) >> 1]), \ - ((SWAP_2_AND_3 (b) - 1) & 1) * 32 + 24 - 40) & 0x00ff0000) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (c) - 1) >> 1]), \ - ((SWAP_2_AND_3 (c) - 1) & 1) * 32 + 24 - 48) & 0x0000ff00) \ - | (SHIFT_S ((in [(SWAP_2_AND_3 (d) - 1) >> 1]), \ - ((SWAP_2_AND_3 (d) - 1) & 1) * 32 + 24 - 56) & 0x000000ff)) -#endif +/* ----------------------------------- * + * sRGB/linear conversion: Shared code * + * ----------------------------------- */ -/* Pack p -> p */ +/* These tables are manually tweaked to be reversible without information + * loss; _smol_to_srgb_lut [_smol_from_srgb_lut [i]] == i. + * + * As a side effect, the values in the lower range (first 35 indexes) are + * off by < 2%. */ + +const uint16_t _smol_from_srgb_lut [256] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 62, 64, 67, 69, 72, 74, 77, 79, 82, 85, 88, 91, + 94, 97, 100, 103, 106, 109, 113, 116, 119, 123, 126, 130, + 134, 137, 141, 145, 149, 153, 157, 161, 165, 169, 174, 178, + 182, 187, 191, 196, 201, 205, 210, 215, 220, 225, 230, 235, + 240, 246, 251, 256, 262, 267, 273, 279, 284, 290, 296, 302, + 308, 314, 320, 326, 333, 339, 345, 352, 359, 365, 372, 379, + 385, 392, 399, 406, 414, 421, 428, 435, 443, 450, 458, 466, + 473, 481, 489, 497, 505, 513, 521, 530, 538, 546, 555, 563, + 572, 581, 589, 598, 607, 616, 625, 634, 644, 653, 662, 672, + 682, 691, 701, 711, 721, 731, 741, 751, 761, 771, 782, 792, + 803, 813, 824, 835, 845, 856, 867, 879, 890, 901, 912, 924, + 935, 947, 959, 970, 982, 994, 1006, 1018, 1030, 1043, 1055, 1067, + 1080, 1093, 1105, 1118, 1131, 1144, 1157, 1170, 1183, 1197, 1210, 1223, + 1237, 1251, 1264, 1278, 1292, 1306, 1320, 1334, 1349, 1363, 1377, 1392, + 1407, 1421, 1436, 1451, 1466, 1481, 1496, 1512, 1527, 1542, 1558, 1573, + 1589, 1605, 1621, 1637, 1653, 1669, 1685, 1702, 1718, 1735, 1751, 1768, + 1785, 1802, 1819, 1836, 1853, 1870, 1887, 1905, 1922, 1940, 1958, 1976, + 1994, 2012, 2030, 2047 +}; -static SMOL_INLINE uint32_t -pack_pixel_1324_p_to_1234_p_64bpp (uint64_t in) -{ - return in | (in >> 24); -} +const uint8_t _smol_to_srgb_lut [SRGB_LINEAR_MAX] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 37, 37, 38, 38, + 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, + 46, 46, 47, 47, 47, 48, 48, 49, 49, 49, 50, 50, 51, 51, + 51, 52, 52, 53, 53, 53, 54, 54, 55, 55, 55, 56, 56, 56, + 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, + 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, + 66, 66, 66, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 70, + 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, + 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 77, 77, + 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, + 81, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 
83, 83, 84, + 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 86, 87, + 87, 87, 87, 88, 88, 88, 88, 88, 89, 89, 89, 89, 89, 90, + 90, 90, 90, 90, 91, 91, 91, 91, 91, 92, 92, 92, 92, 92, + 93, 93, 93, 93, 93, 94, 94, 94, 94, 94, 95, 95, 95, 95, + 95, 96, 96, 96, 96, 96, 97, 97, 97, 97, 97, 98, 98, 98, + 98, 98, 98, 99, 99, 99, 99, 99, 100, 100, 100, 100, 100, 100, + 101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 102, 103, 103, 103, + 103, 103, 103, 104, 104, 104, 104, 104, 105, 105, 105, 105, 105, 105, + 106, 106, 106, 106, 106, 106, 107, 107, 107, 107, 107, 107, 108, 108, + 108, 108, 108, 108, 109, 109, 109, 109, 109, 109, 110, 110, 110, 110, + 110, 110, 110, 111, 111, 111, 111, 111, 111, 112, 112, 112, 112, 112, + 112, 113, 113, 113, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114, + 115, 115, 115, 115, 115, 115, 115, 116, 116, 116, 116, 116, 116, 117, + 117, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 119, + 119, 119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 122, 123, 123, + 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 124, 125, + 125, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126, 127, + 127, 127, 127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, + 130, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, + 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, + 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, 137, 137, 137, 137, 137, 137, + 137, 138, 138, 138, 138, 138, 138, 138, 138, 139, 139, 139, 139, 139, + 139, 139, 139, 140, 140, 140, 140, 140, 140, 140, 140, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 142, 142, 142, 142, 142, 142, 142, 142, + 143, 143, 143, 143, 143, 143, 143, 143, 143, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 145, 145, 145, 145, 145, 145, 145, 145, 146, 146, + 146, 146, 146, 146, 146, 146, 146, 147, 147, 147, 147, 147, 147, 147, + 147, 147, 148, 148, 148, 148, 148, 148, 148, 148, 148, 149, 149, 149, + 149, 149, 149, 149, 149, 149, 150, 150, 150, 150, 150, 150, 150, 150, + 150, 151, 151, 151, 151, 151, 151, 151, 151, 151, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, 153, 153, 153, 153, 153, 153, + 153, 154, 154, 154, 154, 154, 154, 154, 154, 154, 154, 155, 155, 155, + 155, 155, 155, 155, 155, 155, 156, 156, 156, 156, 156, 156, 156, 156, + 156, 156, 157, 157, 157, 157, 157, 157, 157, 157, 157, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 158, 159, 159, 159, 159, 159, 159, 159, + 159, 159, 159, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 161, + 161, 161, 161, 161, 161, 161, 161, 161, 161, 162, 162, 162, 162, 162, + 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 163, + 163, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, + 167, 168, 168, 168, 168, 168, 168, 168, 168, 168, 168, 169, 169, 169, + 169, 169, 169, 169, 169, 169, 169, 169, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 171, 171, 171, 171, 171, 171, 171, 171, 171, + 171, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 172, 173, 173, + 173, 173, 173, 173, 173, 173, 173, 173, 173, 174, 174, 174, 174, 174, + 174, 174, 174, 174, 174, 174, 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 
176, 176, 176, 176, 176, 176, 176, 176, 176, 176, 176, + 176, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, 178, 179, 179, 179, 179, 179, + 179, 179, 179, 179, 179, 179, 179, 180, 180, 180, 180, 180, 180, 180, + 180, 180, 180, 180, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, + 181, 181, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, + 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 183, 184, 184, 184, + 184, 184, 184, 184, 184, 184, 184, 184, 184, 185, 185, 185, 185, 185, + 185, 185, 185, 185, 185, 185, 185, 186, 186, 186, 186, 186, 186, 186, + 186, 186, 186, 186, 186, 187, 187, 187, 187, 187, 187, 187, 187, 187, + 187, 187, 187, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, + 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 190, 191, 191, + 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 191, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, 193, + 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, + 194, 194, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 195, 195, + 195, 195, 195, 195, 195, 195, 196, 196, 196, 196, 196, 196, 196, 196, + 196, 196, 196, 196, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, + 197, 197, 197, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, + 198, 198, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, + 199, 199, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 200, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, + 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, 202, + 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 205, + 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 209, 209, + 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 210, 210, + 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 211, 211, + 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 213, + 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 213, 214, + 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 215, + 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, + 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, + 216, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, + 217, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, + 218, 218, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 222, 222, 222, 222, 222, 222, 222, 222, 222, + 222, 222, 222, 222, 222, 222, 223, 223, 223, 223, 223, 223, 223, 223, + 223, 223, 223, 223, 223, 223, 223, 223, 224, 224, 224, 224, 224, 224, + 224, 224, 224, 224, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, + 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 226, 226, 226, 226, + 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 226, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, 
227, 227, 227, 227, 227, 227, + 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, + 228, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 231, 231, 231, 231, 231, 231, 231, 231, 231, + 231, 231, 231, 231, 231, 231, 231, 232, 232, 232, 232, 232, 232, 232, + 232, 232, 232, 232, 232, 232, 232, 232, 232, 233, 233, 233, 233, 233, + 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, + 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, 235, + 235, 235, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 237, 237, 237, 237, 237, 237, 237, 237, 237, 237, + 237, 237, 237, 237, 237, 237, 237, 238, 238, 238, 238, 238, 238, 238, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 239, 239, 239, 239, + 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 239, 240, 240, + 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, 240, + 240, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, 241, + 241, 241, 241, 241, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 243, 243, 243, 243, 243, 243, 243, + 243, 243, 243, 243, 243, 243, 243, 243, 243, 243, 244, 244, 244, 244, + 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 245, + 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, + 245, 245, 245, 246, 246, 246, 246, 246, 246, 246, 246, 246, 246, 246, + 246, 246, 246, 246, 246, 246, 247, 247, 247, 247, 247, 247, 247, 247, + 247, 247, 247, 247, 247, 247, 247, 247, 247, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 249, + 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, + 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, + 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255 +}; -static void -pack_row_1324_p_to_1234_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; +/* ------------------------------ * + * Premultiplication: Shared code * + * ------------------------------ */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* These tables are used to divide by an integer [1..255] using only a lookup, + * multiplication and a shift. This is faster than plain division on most + * architectures. + * + * The values are tuned to minimize the error and overhead when turning + * premultiplied (8-bit, 11-bit, 16-bit, 19-bit) into 8-bit unassociated alpha. */ + +/* Lossy premultiplication: 8-bit * alpha -> 8-bit. Not perfectly reversible. 
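+ *
+ * Rough sketch of the trick (generic constants for illustration only; the
+ * entries in these tables are tuned per table and are not restated here):
+ * to divide x by d with a multiply and a shift, precompute K_d ~= 2^S / d
+ * and evaluate
+ *
+ *   x / d  ~=  (x * K_d) >> S
+ *
+ * e.g. with S = 16 and K_5 = 13108, (201 * 13108) >> 16 = 40, matching 201 / 5.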
*/ +const uint32_t _smol_inv_div_p8_lut [256] = +{ + 0x00000000, 0x00181fff, 0x000e2fff, 0x0009f555, 0x0007a7ff, 0x00063333, 0x00052555, 0x00047999, + 0x0003ebff, 0x0003838e, 0x00032333, 0x0002e2e8, 0x0002a2aa, 0x0002713b, 0x00024249, 0x00021ccc, + 0x0001f924, 0x0001dd17, 0x0001c1c7, 0x0001ab4b, 0x000195e5, 0x0001830c, 0x000170c3, 0x00016164, + 0x0001537a, 0x0001450d, 0x0001390b, 0x00012de9, 0x00012249, 0x00011846, 0x00010eaa, 0x0001069e, + 0x0000fd70, 0x0000f6aa, 0x0000eedb, 0x0000e8f5, 0x0000e1c7, 0x0000db8e, 0x0000d638, 0x0000d069, + 0x0000cb7c, 0x0000c690, 0x0000c186, 0x0000bd2d, 0x0000b8f9, 0x0000b4f7, 0x0000b0ff, 0x0000ad65, + 0x0000a9ac, 0x0000a687, 0x0000a286, 0x00009f33, 0x00009c98, 0x000099b9, 0x000096f1, 0x00009414, + 0x00009147, 0x00008efa, 0x00008c59, 0x00008a0a, 0x000087b4, 0x0000856c, 0x00008341, 0x0000818c, + 0x00007f55, 0x00007d60, 0x00007b7f, 0x000079b2, 0x000077b9, 0x00007608, 0x0000743c, 0x000072b5, + 0x0000711a, 0x00006fac, 0x00006e1a, 0x00006cad, 0x00006b17, 0x000069e1, 0x00006864, 0x00006736, + 0x000065db, 0x000064b1, 0x00006357, 0x00006250, 0x000060c5, 0x00006060, 0x00005ec0, 0x00005da5, + 0x00005c9b, 0x00005b8b, 0x00005a93, 0x000059ab, 0x00005884, 0x00005799, 0x000056ae, 0x000055d5, + 0x000054e2, 0x0000540b, 0x00005343, 0x00005255, 0x0000517c, 0x000050a7, 0x00004fff, 0x00004f2c, + 0x00004e5e, 0x00004d9f, 0x00004cec, 0x00004c34, 0x00004b78, 0x00004adc, 0x00004a23, 0x00004981, + 0x000048ce, 0x00004836, 0x0000478c, 0x000046eb, 0x00004656, 0x000045b6, 0x00004524, 0x0000449c, + 0x000043ff, 0x00004370, 0x000042e2, 0x00004257, 0x000041ce, 0x00004147, 0x000040c3, 0x00004081, + 0x00003fff, 0x00003f57, 0x00003ed3, 0x00003e54, 0x00003dd9, 0x00003d60, 0x00003ced, 0x00003c78, + 0x00003c07, 0x00003b9a, 0x00003b26, 0x00003abf, 0x00003a4f, 0x000039e1, 0x0000397e, 0x00003917, + 0x000038af, 0x00003848, 0x000037ee, 0x00003787, 0x00003726, 0x000036c9, 0x0000366b, 0x0000360d, + 0x000035b0, 0x00003567, 0x00003503, 0x000034aa, 0x00003453, 0x000033ff, 0x000033a8, 0x0000335c, + 0x00003305, 0x000032b3, 0x00003266, 0x00003213, 0x000031c7, 0x00003178, 0x0000312b, 0x000030df, + 0x00003094, 0x00003049, 0x00003018, 0x00002fc0, 0x00002f76, 0x00002f2d, 0x00002ee8, 0x00002ea6, + 0x00002e5f, 0x00002e1c, 0x00002dd9, 0x00002d99, 0x00002d59, 0x00002d17, 0x00002cdf, 0x00002c9b, + 0x00002c5d, 0x00002c1c, 0x00002be1, 0x00002ba6, 0x00002b6a, 0x00002b2e, 0x00002af3, 0x00002ac7, + 0x00002a85, 0x00002a4a, 0x00002a11, 0x000029dc, 0x000029a6, 0x0000296e, 0x00002936, 0x00002904, + 0x000028cd, 0x0000289a, 0x00002866, 0x00002833, 0x0000280a, 0x000027d0, 0x0000279e, 0x0000276f, + 0x0000273c, 0x0000270d, 0x000026de, 0x000026ad, 0x0000267e, 0x00002652, 0x00002622, 0x000025f5, + 0x000025c9, 0x0000259b, 0x0000256f, 0x00002545, 0x00002518, 0x000024ef, 0x000024c3, 0x0000249c, + 0x0000246f, 0x00002446, 0x0000241c, 0x000023f4, 0x000023ca, 0x000023a2, 0x0000237b, 0x00002354, + 0x0000232e, 0x00002306, 0x000022e0, 0x000022b9, 0x00002294, 0x0000226f, 0x0000224b, 0x00002226, + 0x00002202, 0x000021dc, 0x000021b8, 0x00002195, 0x00002172, 0x0000214f, 0x0000212c, 0x0000210a, + 0x000020e7, 0x000020c5, 0x000020a4, 0x00002083, 0x00002061, 0x00002041, 0x00002020, 0x00002020 +}; - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - } -} +/* Lossy premultiplication: 11-bit * alpha -> 11-bit. Not perfectly reversible. 
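+ *
+ * The 11-bit range here is assumed to be the linear-light form produced by
+ * _smol_from_srgb_lut above, which maps 8-bit sRGB values into [0..2047];
+ * otherwise the mechanics are the same as for the 8-bit table.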
*/ +const uint32_t _smol_inv_div_p8l_lut [256] = +{ + 0x00000000, 0x0007ffff, 0x0003ffff, 0x0002aaaa, 0x0001ffff, 0x00019999, 0x00015555, 0x00012492, + 0x0000ffff, 0x0000e38e, 0x0000cccc, 0x0000ba2e, 0x0000aaaa, 0x00009d89, 0x00009249, 0x00008888, + 0x00007fff, 0x00007878, 0x000071c7, 0x00006bca, 0x00006666, 0x00006186, 0x00005d17, 0x0000590b, + 0x00005555, 0x000051eb, 0x00004ec4, 0x00004bda, 0x00004924, 0x0000469e, 0x00004444, 0x00004210, + 0x00003fff, 0x00003e0f, 0x00003c3c, 0x00003a83, 0x000038e3, 0x0000372a, 0x000035b7, 0x00003458, + 0x0000330a, 0x000031cc, 0x0000309e, 0x00002f7d, 0x00002e69, 0x00002d62, 0x00002c66, 0x00002b75, + 0x00002a8e, 0x000029b0, 0x000028db, 0x0000280f, 0x0000274a, 0x0000268c, 0x000025d6, 0x00002526, + 0x0000247d, 0x000023d9, 0x0000233c, 0x000022a3, 0x0000220f, 0x00002181, 0x000020f7, 0x00002071, + 0x00001ff0, 0x00001f72, 0x00001ef8, 0x00001e82, 0x00001e0f, 0x00001da0, 0x00001d34, 0x00001ccb, + 0x00001c65, 0x00001bf5, 0x00001b95, 0x00001b37, 0x00001adb, 0x00001a82, 0x00001a2c, 0x000019d7, + 0x00001985, 0x00001934, 0x000018e6, 0x00001899, 0x0000184f, 0x00001806, 0x000017be, 0x00001779, + 0x00001734, 0x000016f2, 0x000016b1, 0x00001671, 0x00001633, 0x000015f6, 0x000015ba, 0x00001580, + 0x00001547, 0x0000150f, 0x000014d8, 0x000014a2, 0x0000146d, 0x0000143a, 0x00001407, 0x000013d5, + 0x000013a5, 0x00001375, 0x00001346, 0x00001318, 0x000012eb, 0x000012be, 0x0000128e, 0x00001263, + 0x00001239, 0x00001210, 0x000011e7, 0x000011c0, 0x00001199, 0x00001172, 0x0000114d, 0x00001127, + 0x00001103, 0x000010df, 0x000010bc, 0x00001099, 0x00001077, 0x00001055, 0x00001034, 0x00001014, + 0x00000ff4, 0x00000fd4, 0x00000fb5, 0x00000f96, 0x00000f78, 0x00000f5a, 0x00000f3d, 0x00000f20, + 0x00000f04, 0x00000ee8, 0x00000ecc, 0x00000eb1, 0x00000e96, 0x00000e7c, 0x00000e62, 0x00000e48, + 0x00000e2f, 0x00000e16, 0x00000dfa, 0x00000de2, 0x00000dca, 0x00000db2, 0x00000d9b, 0x00000d84, + 0x00000d6d, 0x00000d57, 0x00000d41, 0x00000d2b, 0x00000d16, 0x00000d00, 0x00000ceb, 0x00000cd7, + 0x00000cc2, 0x00000cae, 0x00000c9a, 0x00000c86, 0x00000c73, 0x00000c5f, 0x00000c4c, 0x00000c3a, + 0x00000c27, 0x00000c15, 0x00000c03, 0x00000bf1, 0x00000bdf, 0x00000bcd, 0x00000bbc, 0x00000bab, + 0x00000b9a, 0x00000b89, 0x00000b79, 0x00000b68, 0x00000b58, 0x00000b48, 0x00000b38, 0x00000b27, + 0x00000b17, 0x00000b08, 0x00000af9, 0x00000aea, 0x00000adb, 0x00000acc, 0x00000abe, 0x00000ab0, + 0x00000aa1, 0x00000a93, 0x00000a85, 0x00000a78, 0x00000a6a, 0x00000a5c, 0x00000a4f, 0x00000a42, + 0x00000a35, 0x00000a28, 0x00000a1b, 0x00000a0e, 0x00000a02, 0x000009f5, 0x000009e9, 0x000009dd, + 0x000009d1, 0x000009c5, 0x000009b9, 0x000009ad, 0x000009a1, 0x00000996, 0x0000098a, 0x0000097f, + 0x00000974, 0x00000969, 0x0000095e, 0x00000951, 0x00000947, 0x0000093c, 0x00000931, 0x00000927, + 0x0000091c, 0x00000912, 0x00000908, 0x000008fe, 0x000008f3, 0x000008e9, 0x000008e0, 0x000008d6, + 0x000008cc, 0x000008c2, 0x000008b9, 0x000008af, 0x000008a6, 0x0000089d, 0x00000893, 0x0000088a, + 0x00000881, 0x00000878, 0x0000086f, 0x00000866, 0x0000085e, 0x00000855, 0x0000084c, 0x00000844, + 0x0000083b, 0x00000833, 0x0000082a, 0x00000822, 0x0000081a, 0x00000812, 0x0000080a, 0x00000801 +}; -static void -pack_row_132a_p_to_123_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; +/* Lossless premultiplication: 8-bit * alpha -> 16-bit. Reversible with this table. 
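+ *
+ * Sketch of why the 16-bit form can be exact (the precise scheme used by this
+ * table is not restated here): if premultiplication keeps the full product
+ * p = c * a of an 8-bit channel c and 8-bit alpha a, then p <= 255 * 255 = 65025
+ * fits in 16 bits and c = p / a is recovered exactly; the lossy 8-bit variant
+ * above rounds p back down to 8 bits first, so its division can only
+ * approximate c.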
*/ +const uint32_t _smol_inv_div_p16_lut [256] = +{ + 0x00000000, 0x00005556, 0x00004000, 0x00003334, 0x00002aab, 0x00002493, 0x00002000, 0x00001c72, + 0x0000199a, 0x00001746, 0x00001556, 0x000013b2, 0x0000124a, 0x00001112, 0x00001000, 0x00000f10, + 0x00000e39, 0x00000d7a, 0x00000ccd, 0x00000c31, 0x00000ba3, 0x00000b22, 0x00000aab, 0x00000a3e, + 0x000009d9, 0x0000097c, 0x00000925, 0x000008d4, 0x00000889, 0x00000843, 0x00000800, 0x000007c2, + 0x00000788, 0x00000751, 0x0000071d, 0x000006ec, 0x000006bd, 0x00000691, 0x00000667, 0x0000063f, + 0x00000619, 0x000005f5, 0x000005d2, 0x000005b1, 0x00000591, 0x00000573, 0x00000556, 0x0000053a, + 0x0000051f, 0x00000506, 0x000004ed, 0x000004d5, 0x000004be, 0x000004a8, 0x00000493, 0x0000047e, + 0x0000046a, 0x00000457, 0x00000445, 0x00000433, 0x00000422, 0x00000411, 0x00000400, 0x000003f1, + 0x000003e1, 0x000003d3, 0x000003c4, 0x000003b6, 0x000003a9, 0x0000039c, 0x0000038f, 0x00000382, + 0x00000376, 0x0000036a, 0x0000035f, 0x00000354, 0x00000349, 0x0000033e, 0x00000334, 0x0000032a, + 0x00000320, 0x00000316, 0x0000030d, 0x00000304, 0x000002fb, 0x000002f2, 0x000002e9, 0x000002e1, + 0x000002d9, 0x000002d1, 0x000002c9, 0x000002c1, 0x000002ba, 0x000002b2, 0x000002ab, 0x000002a4, + 0x0000029d, 0x00000296, 0x00000290, 0x00000289, 0x00000283, 0x0000027d, 0x00000277, 0x00000271, + 0x0000026b, 0x00000265, 0x0000025f, 0x0000025a, 0x00000254, 0x0000024f, 0x0000024a, 0x00000244, + 0x0000023f, 0x0000023a, 0x00000235, 0x00000231, 0x0000022c, 0x00000227, 0x00000223, 0x0000021e, + 0x0000021a, 0x00000215, 0x00000211, 0x0000020d, 0x00000209, 0x00000205, 0x00000200, 0x000001fd, + 0x000001f9, 0x000001f5, 0x000001f1, 0x000001ed, 0x000001ea, 0x000001e6, 0x000001e2, 0x000001df, + 0x000001db, 0x000001d8, 0x000001d5, 0x000001d1, 0x000001ce, 0x000001cb, 0x000001c8, 0x000001c4, + 0x000001c1, 0x000001be, 0x000001bb, 0x000001b8, 0x000001b5, 0x000001b3, 0x000001b0, 0x000001ad, + 0x000001aa, 0x000001a7, 0x000001a5, 0x000001a2, 0x0000019f, 0x0000019d, 0x0000019a, 0x00000198, + 0x00000195, 0x00000193, 0x00000190, 0x0000018e, 0x0000018b, 0x00000189, 0x00000187, 0x00000184, + 0x00000182, 0x00000180, 0x0000017e, 0x0000017b, 0x00000179, 0x00000177, 0x00000175, 0x00000173, + 0x00000171, 0x0000016f, 0x0000016d, 0x0000016b, 0x00000169, 0x00000167, 0x00000165, 0x00000163, + 0x00000161, 0x0000015f, 0x0000015d, 0x0000015b, 0x00000159, 0x00000158, 0x00000156, 0x00000154, + 0x00000152, 0x00000151, 0x0000014f, 0x0000014d, 0x0000014b, 0x0000014a, 0x00000148, 0x00000147, + 0x00000145, 0x00000143, 0x00000142, 0x00000140, 0x0000013f, 0x0000013d, 0x0000013c, 0x0000013a, + 0x00000139, 0x00000137, 0x00000136, 0x00000134, 0x00000133, 0x00000131, 0x00000130, 0x0000012f, + 0x0000012d, 0x0000012c, 0x0000012a, 0x00000129, 0x00000128, 0x00000126, 0x00000125, 0x00000124, + 0x00000122, 0x00000121, 0x00000120, 0x0000011f, 0x0000011d, 0x0000011c, 0x0000011b, 0x0000011a, + 0x00000119, 0x00000117, 0x00000116, 0x00000115, 0x00000114, 0x00000113, 0x00000112, 0x00000110, + 0x0000010f, 0x0000010e, 0x0000010d, 0x0000010c, 0x0000010b, 0x0000010a, 0x00000109, 0x00000108, + 0x00000107, 0x00000106, 0x00000105, 0x00000104, 0x00000103, 0x00000102, 0x00000100, 0x00000100 +}; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); +/* Lossless premultiplication: 11-bit * alpha -> 19-bit. Reversible with this table. 
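+ *
+ * Size check for the 19-bit product: an 11-bit linear value (max 2047) times
+ * an 8-bit alpha (max 255) is at most 2047 * 255 = 521985, which fits below
+ * 2^19 = 524288.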
*/ +const uint32_t _smol_inv_div_p16l_lut [256] = +{ + 0x00000000, 0x0002aaab, 0x00020000, 0x0001999a, 0x00015556, 0x00012493, 0x00010000, 0x0000e38f, + 0x0000cccd, 0x0000ba2f, 0x0000aaab, 0x00009d8a, 0x0000924a, 0x00008889, 0x00008000, 0x00007879, + 0x000071c8, 0x00006bcb, 0x00006667, 0x00006187, 0x00005d18, 0x0000590c, 0x00005556, 0x000051ec, + 0x00004ec5, 0x00004bdb, 0x00004925, 0x0000469f, 0x00004445, 0x00004211, 0x00004000, 0x00003e10, + 0x00003c3d, 0x00003a84, 0x000038e4, 0x0000375a, 0x000035e6, 0x00003484, 0x00003334, 0x000031f4, + 0x000030c4, 0x00002fa1, 0x00002e8c, 0x00002d83, 0x00002c86, 0x00002b94, 0x00002aab, 0x000029cc, + 0x000028f6, 0x00002829, 0x00002763, 0x000026a5, 0x000025ee, 0x0000253d, 0x00002493, 0x000023ef, + 0x00002350, 0x000022b7, 0x00002223, 0x00002193, 0x00002109, 0x00002083, 0x00002000, 0x00001f82, + 0x00001f08, 0x00001e92, 0x00001e1f, 0x00001daf, 0x00001d42, 0x00001cd9, 0x00001c72, 0x00001c0f, + 0x00001bad, 0x00001b4f, 0x00001af3, 0x00001a99, 0x00001a42, 0x000019ed, 0x0000199a, 0x00001949, + 0x000018fa, 0x000018ad, 0x00001862, 0x00001819, 0x000017d1, 0x0000178b, 0x00001746, 0x00001703, + 0x000016c2, 0x00001682, 0x00001643, 0x00001606, 0x000015ca, 0x0000158f, 0x00001556, 0x0000151e, + 0x000014e6, 0x000014b0, 0x0000147b, 0x00001447, 0x00001415, 0x000013e3, 0x000013b2, 0x00001382, + 0x00001353, 0x00001324, 0x000012f7, 0x000012ca, 0x0000129f, 0x00001274, 0x0000124a, 0x00001220, + 0x000011f8, 0x000011d0, 0x000011a8, 0x00001182, 0x0000115c, 0x00001136, 0x00001112, 0x000010ed, + 0x000010ca, 0x000010a7, 0x00001085, 0x00001063, 0x00001042, 0x00001021, 0x00001000, 0x00000fe1, + 0x00000fc1, 0x00000fa3, 0x00000f84, 0x00000f67, 0x00000f49, 0x00000f2c, 0x00000f10, 0x00000ef3, + 0x00000ed8, 0x00000ebc, 0x00000ea1, 0x00000e87, 0x00000e6d, 0x00000e53, 0x00000e39, 0x00000e20, + 0x00000e08, 0x00000def, 0x00000dd7, 0x00000dbf, 0x00000da8, 0x00000d91, 0x00000d7a, 0x00000d63, + 0x00000d4d, 0x00000d37, 0x00000d21, 0x00000d0c, 0x00000cf7, 0x00000ce2, 0x00000ccd, 0x00000cb9, + 0x00000ca5, 0x00000c91, 0x00000c7d, 0x00000c6a, 0x00000c57, 0x00000c44, 0x00000c31, 0x00000c1f, + 0x00000c0d, 0x00000bfb, 0x00000be9, 0x00000bd7, 0x00000bc6, 0x00000bb4, 0x00000ba3, 0x00000b93, + 0x00000b82, 0x00000b71, 0x00000b61, 0x00000b51, 0x00000b41, 0x00000b31, 0x00000b22, 0x00000b12, + 0x00000b03, 0x00000af4, 0x00000ae5, 0x00000ad7, 0x00000ac8, 0x00000ab9, 0x00000aab, 0x00000a9d, + 0x00000a8f, 0x00000a81, 0x00000a73, 0x00000a66, 0x00000a58, 0x00000a4b, 0x00000a3e, 0x00000a31, + 0x00000a24, 0x00000a17, 0x00000a0b, 0x000009fe, 0x000009f2, 0x000009e5, 0x000009d9, 0x000009cd, + 0x000009c1, 0x000009b5, 0x000009aa, 0x0000099e, 0x00000992, 0x00000987, 0x0000097c, 0x00000971, + 0x00000965, 0x0000095b, 0x00000950, 0x00000945, 0x0000093a, 0x00000930, 0x00000925, 0x0000091b, + 0x00000910, 0x00000906, 0x000008fc, 0x000008f2, 0x000008e8, 0x000008de, 0x000008d4, 0x000008cb, + 0x000008c1, 0x000008b8, 0x000008ae, 0x000008a5, 0x0000089b, 0x00000892, 0x00000889, 0x00000880, + 0x00000877, 0x0000086e, 0x00000865, 0x0000085c, 0x00000854, 0x0000084b, 0x00000843, 0x0000083a, + 0x00000832, 0x00000829, 0x00000821, 0x00000819, 0x00000811, 0x00000809, 0x00000800, 0x000007f9 +}; - while (row_out != row_out_max) - { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } -} +/* ------- * + * Helpers * + * ------- */ -static void -pack_row_132a_p_to_321_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, 
- uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE int +check_row_range (const SmolScaleCtx *scale_ctx, + int32_t *first_dest_row, + int32_t *n_dest_rows) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (*first_dest_row < 0) { - /* FIXME: Would be faster to shift directly */ - uint32_t p = pack_pixel_1324_p_to_1234_p_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; + *n_dest_rows += *first_dest_row; + *first_dest_row = 0; } -} - -#define DEF_PACK_FROM_1324_P_TO_P_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (uint64_t in) \ -{ \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1324_p_to_##a##b##c##d##_p_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_1324_p_to_##a##b##c##d##_p_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_1324_P_TO_P_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_1324_P_TO_P_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_1324_P_TO_P_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_1324_P_TO_P_64BPP (4, 3, 2, 1) - -static SMOL_INLINE uint32_t -pack_pixel_1234_p_to_1234_p_128bpp (const uint64_t *in) -{ - /* FIXME: Are masks needed? */ - return ((in [0] >> 8) & 0xff000000) - | ((in [0] << 16) & 0x00ff0000) - | ((in [1] >> 24) & 0x0000ff00) - | (in [1] & 0x000000ff); -} - -static void -pack_row_1234_p_to_1234_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (*first_dest_row >= (int32_t) scale_ctx->vdim.dest_size_px) { - *(row_out++) = pack_pixel_1234_p_to_1234_p_128bpp (row_in); - row_in += 2; + return 0; } -} - -#define DEF_PACK_FROM_1234_P_TO_P_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - return PACK_FROM_1234_128BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_1234_p_to_##a##b##c##d##_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_1234_p_to_##a##b##c##d##_p_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_1234_P_TO_P_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_1234_P_TO_P_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_1234_P_TO_P_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_1234_P_TO_P_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (*n_dest_rows < 0 || *first_dest_row + *n_dest_rows > (int32_t) scale_ctx->vdim.dest_size_px) { - *(row_out++) = *row_in >> 32; - *(row_out++) = *(row_in++); - *(row_out++) = *(row_in++) >> 32; + *n_dest_rows = scale_ctx->vdim.dest_size_px - 
*first_dest_row; } -} - -static void -pack_row_123a_p_to_321_p_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else if (*n_dest_rows == 0) { - *(row_out++) = row_in [1] >> 32; - *(row_out++) = row_in [0]; - *(row_out++) = row_in [0] >> 32; - row_in += 2; + return 0; } -} -/* Pack p (alpha last) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_132a_p_to_1234_u_64bpp (uint64_t in) -{ - uint8_t alpha = in; - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; - return in | (in >> 24); + return 1; } -static void -pack_row_132a_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - } -} +/* ------------------- * + * Scaling: Outer loop * + * ------------------- */ -static void -pack_row_132a_p_to_123_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE const char * +src_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t src_row_ofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + return scale_ctx->src_pixels + scale_ctx->src_rowstride * src_row_ofs; } -static void -pack_row_132a_p_to_321_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE char * +dest_row_ofs_to_pointer (const SmolScaleCtx *scale_ctx, + uint32_t dest_row_ofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_132a_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -#define DEF_PACK_FROM_132A_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = in; \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0xffffffffffffff00) | alpha; \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_132a_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_132a_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_132A_P_TO_U_64BPP (3, 2, 1, 4) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 1, 2, 3) -DEF_PACK_FROM_132A_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_123A_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [1]; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ 
-pack_row_123a_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ + return scale_ctx->dest_pixels + scale_ctx->dest_rowstride * dest_row_ofs; } -DEF_PACK_FROM_123A_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (3, 2, 1, 4) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 1, 2, 3) -DEF_PACK_FROM_123A_P_TO_U_128BPP (4, 3, 2, 1) - -static void -pack_row_123a_p_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SMOL_INLINE void * +dest_hofs_to_pointer (const SmolScaleCtx *scale_ctx, + void *dest_row_ptr, + uint32_t dest_hofs) { - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } + uint8_t *dest_row_ptr_u8 = dest_row_ptr; + return dest_row_ptr_u8 + dest_hofs * pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride; } static void -pack_row_123a_p_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; - } -} - -/* Pack p (alpha first) -> u */ - -static SMOL_INLINE uint32_t -pack_pixel_a324_p_to_1234_u_64bpp (uint64_t in) +copy_row (const SmolScaleCtx *scale_ctx, + uint32_t dest_row_index, + uint32_t *row_out) { - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); - return in | (in >> 24); + memcpy (row_out, + src_row_ofs_to_pointer (scale_ctx, dest_row_index), + scale_ctx->hdim.dest_size_px * pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride); } static void -pack_row_a324_p_to_1234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +scale_dest_row (const SmolScaleCtx *scale_ctx, + SmolLocalCtx *local_ctx, + uint32_t dest_row_index, + void *row_out) { - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + if (dest_row_index < scale_ctx->vdim.clear_before_px + || dest_row_index >= scale_ctx->vdim.dest_size_px - scale_ctx->vdim.clear_after_px) { - *(row_out++) = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - } -} - -static void -pack_row_a324_p_to_234_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + /* Row doesn't intersect placement */ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear entire row */ + 
scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + row_out, + scale_ctx->hdim.dest_size_px); + } } -} - -static void -pack_row_a324_p_to_432_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) + else { - uint32_t p = pack_pixel_a324_p_to_1234_u_64bpp (*(row_in++)); - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - } -} - -#define DEF_PACK_FROM_A324_P_TO_U_64BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (uint64_t in) \ -{ \ - uint8_t alpha = (in >> 48) & 0xff; /* FIXME: May not need mask */ \ - in = (unpremul_p_to_u_64bpp (in, alpha) & 0x0000ffffffffffff) | ((uint64_t) alpha << 48); \ - return PACK_FROM_1324_64BPP (in, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a324_p_to_##a##b##c##d##_u_64bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - *(row_out++) = pack_pixel_a324_p_to_##a##b##c##d##_u_64bpp (*(row_in++)); \ -} - -DEF_PACK_FROM_A324_P_TO_U_64BPP (1, 4, 3, 2) -DEF_PACK_FROM_A324_P_TO_U_64BPP (2, 3, 4, 1) -DEF_PACK_FROM_A324_P_TO_U_64BPP (4, 3, 2, 1) - -#define DEF_PACK_FROM_A234_P_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint64_t t [2]; \ - uint8_t alpha = in [0] >> 32; \ - unpremul_p_to_u_128bpp (in, t, alpha); \ - t [0] = (t [0] & 0x00000000ffffffff) | ((uint64_t) alpha << 32); \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_a234_p_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_a234_p_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 2, 3, 4) -DEF_PACK_FROM_A234_P_TO_U_128BPP (1, 4, 3, 2) -DEF_PACK_FROM_A234_P_TO_U_128BPP (2, 3, 4, 1) -DEF_PACK_FROM_A234_P_TO_U_128BPP (4, 3, 2, 1) + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear left */ + scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + row_out, + scale_ctx->hdim.clear_before_px); + } -static void -pack_row_a234_p_to_234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + if (scale_ctx->is_noop) + { + copy_row (scale_ctx, dest_row_index, row_out); + } + else + { + int scaled_row_index; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + scaled_row_index = scale_ctx->vfilter_func (scale_ctx, + local_ctx, + dest_row_index - scale_ctx->vdim.clear_before_px); - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - *(row_out++) = p; - } -} + if ((scale_ctx->composite_op == SMOL_COMPOSITE_SRC + || scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + && scale_ctx->have_composite_color) + { + scale_ctx->composite_over_color_func (local_ctx->parts_row 
[scaled_row_index], + scale_ctx->color_pixel, + scale_ctx->hdim.placement_size_px); + } -static void -pack_row_a234_p_to_432_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + scale_ctx->pack_row_func (local_ctx->parts_row [scaled_row_index], + dest_hofs_to_pointer (scale_ctx, row_out, scale_ctx->hdim.placement_ofs_px), + scale_ctx->hdim.placement_size_px); - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + } - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_a234_p_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; + if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_CLEAR_DEST) + { + /* Clear right */ + scale_ctx->clear_dest_func (scale_ctx->color_pixels_clear_batch, + dest_hofs_to_pointer (scale_ctx, row_out, + scale_ctx->hdim.placement_ofs_px + + scale_ctx->hdim.placement_size_px), + scale_ctx->hdim.clear_after_px); + } } -} - -/* Pack i (alpha last) to u */ -static SMOL_INLINE uint32_t -pack_pixel_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT in) -{ - uint8_t alpha = (in [1] >> 8) & 0xff; - uint64_t t [2]; - - unpremul_i_to_u_128bpp (in, t, alpha); - - return ((t [0] >> 8) & 0xff000000) - | ((t [0] << 16) & 0x00ff0000) - | ((t [1] >> 24) & 0x0000ff00) - | alpha; + if (scale_ctx->post_row_func) + scale_ctx->post_row_func (row_out, scale_ctx->hdim.dest_size_px, scale_ctx->user_data); } static void -pack_row_123a_i_to_1234_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint32_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +do_rows (const SmolScaleCtx *scale_ctx, + void *dest, + uint32_t row_dest_index, + uint32_t n_rows) { - uint32_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - } -} + SmolLocalCtx local_ctx = { 0 }; + uint32_t n_parts_per_pixel = 1; + uint32_t n_stored_rows = 4; + uint32_t i; -static void -pack_row_123a_i_to_123_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; + if (scale_ctx->storage_type == SMOL_STORAGE_128BPP) + n_parts_per_pixel = 2; - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + /* Must be one less, or this test in update_local_ctx() will wrap around: + * if (new_src_ofs == local_ctx->src_ofs + 1) { ... } */ + local_ctx.src_ofs = UINT_MAX - 1; - while (row_out != row_out_max) + for (i = 0; i < n_stored_rows; i++) { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 24; - *(row_out++) = p >> 16; - *(row_out++) = p >> 8; - } -} - -static void -pack_row_123a_i_to_321_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, - uint8_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint8_t *row_out_max = row_out + n_pixels * 3; - - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); + /* Allocate space for an extra pixel at the rightmost edge. This pixel + * allows bilinear horizontal sampling to exceed the input width and + * produce transparency when the output is smaller than its whole-pixel + * count. This is especially noticeable with halving, which can + * produce 2^n such samples (the extra pixel is sampled repeatedly in + * those cases). + * + * FIXME: This is no longer true, and the extra storage is probably not + * needed. 
The edge transparency is now handled by applying a precalculated + * opacity directly. We should verify that the extra storage can be + * eliminated without overruns. */ + + local_ctx.parts_row [i] = + smol_alloc_aligned (MAX (scale_ctx->hdim.src_size_px + 1, scale_ctx->hdim.placement_size_px) + * n_parts_per_pixel * sizeof (uint64_t), + &local_ctx.row_storage [i]); - while (row_out != row_out_max) - { - uint32_t p = pack_pixel_123a_i_to_1234_u_128bpp (row_in); - row_in += 2; - *(row_out++) = p >> 8; - *(row_out++) = p >> 16; - *(row_out++) = p >> 24; + local_ctx.parts_row [i] [scale_ctx->hdim.src_size_px * n_parts_per_pixel] = 0; + if (n_parts_per_pixel == 2) + local_ctx.parts_row [i] [scale_ctx->hdim.src_size_px * n_parts_per_pixel + 1] = 0; } -} - -#define DEF_PACK_FROM_123A_I_TO_U_128BPP(a, b, c, d) \ -static SMOL_INLINE uint32_t \ -pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT in) \ -{ \ - uint8_t alpha = (in [1] >> 8) & 0xff; \ - uint64_t t [2]; \ - unpremul_i_to_u_128bpp (in, t, alpha); \ - t [1] = (t [1] & 0xffffffff00000000ULL) | alpha; \ - return PACK_FROM_1234_128BPP (t, a, b, c, d); \ -} \ - \ -static void \ -pack_row_123a_i_to_##a##b##c##d##_u_128bpp (const uint64_t * SMOL_RESTRICT row_in, \ - uint32_t * SMOL_RESTRICT row_out, \ - uint32_t n_pixels) \ -{ \ - uint32_t *row_out_max = row_out + n_pixels; \ - SMOL_ASSUME_ALIGNED (row_in, const uint64_t *); \ - while (row_out != row_out_max) \ - { \ - *(row_out++) = pack_pixel_123a_i_to_##a##b##c##d##_u_128bpp (row_in); \ - row_in += 2; \ - } \ -} - -DEF_PACK_FROM_123A_I_TO_U_128BPP(3, 2, 1, 4) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 1, 2, 3) -DEF_PACK_FROM_123A_I_TO_U_128BPP(4, 3, 2, 1) - -/* Unpack p -> p */ -static SMOL_INLINE uint64_t -unpack_pixel_1234_p_to_1324_p_64bpp (uint32_t p) -{ - return (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff00ff); -} - -/* AVX2 has a useful instruction for this: __m256i _mm256_cvtepu8_epi16 (__m128i a); - * It results in a different channel ordering, so it'd be important to match with - * the right kind of re-pack. */ -static void -unpack_row_1234_p_to_1324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + for (i = row_dest_index; i < row_dest_index + n_rows; i++) { - *(row_out++) = unpack_pixel_1234_p_to_1324_p_64bpp (*(row_in++)); + scale_dest_row (scale_ctx, &local_ctx, i, dest); + dest = (char *) dest + scale_ctx->dest_rowstride; } -} -static SMOL_INLINE uint64_t -unpack_pixel_123_p_to_132a_p_64bpp (const uint8_t *p) -{ - return ((uint64_t) p [0] << 48) | ((uint32_t) p [1] << 16) - | ((uint64_t) p [2] << 32) | 0xff; -} - -static void -unpack_row_123_p_to_132a_p_64bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + for (i = 0; i < n_stored_rows; i++) { - *(row_out++) = unpack_pixel_123_p_to_132a_p_64bpp (row_in); - row_in += 3; + smol_free (local_ctx.row_storage [i]); } -} -static SMOL_INLINE void -unpack_pixel_1234_p_to_1234_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - out [0] = ((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16); - out [1] = ((p64 & 0x0000ff00) << 24) | (p64 & 0x000000ff); + /* Used to align row data if needed. May be allocated in scale_horizontal(). 
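+     *
+     * Presumably this matters because the unpack/scale paths assume aligned
+     * row storage (see the SMOL_ASSUME_ALIGNED hints and the AVX2 build), so
+     * unaligned caller-supplied pixel data would be copied into an aligned
+     * buffer first and released here.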
*/ + if (local_ctx.src_aligned) + smol_free (local_ctx.src_aligned_storage); } -static void -unpack_row_1234_p_to_1234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_1234_p_to_1234_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} +/* -------------------- * + * Architecture support * + * -------------------- */ -static SMOL_INLINE void -unpack_pixel_123_p_to_123a_p_128bpp (const uint8_t *in, - uint64_t *out) -{ - out [0] = ((uint64_t) in [0] << 32) | in [1]; - out [1] = ((uint64_t) in [2] << 32) | 0xff; -} +#ifdef SMOL_WITH_AVX2 -static void -unpack_row_123_p_to_123a_p_128bpp (const uint8_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SmolBool +have_avx2 (void) { - uint64_t *row_out_max = row_out + n_pixels * 2; + __builtin_cpu_init (); - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + if (__builtin_cpu_supports ("avx2")) + return TRUE; - while (row_out != row_out_max) - { - unpack_pixel_123_p_to_123a_p_128bpp (row_in, row_out); - row_in += 3; - row_out += 2; - } + return FALSE; } -/* Unpack u (alpha first) -> p */ - -static SMOL_INLINE uint64_t -unpack_pixel_a234_u_to_a324_p_64bpp (uint32_t p) -{ - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; - - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); -} +#endif -static void -unpack_row_a234_u_to_a324_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) +static SmolBool +host_is_little_endian (void) { - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) + static const union { - *(row_out++) = unpack_pixel_a234_u_to_a324_p_64bpp (*(row_in++)); + uint8_t u8 [4]; + uint32_t u32; } -} - -static SMOL_INLINE void -unpack_pixel_a234_u_to_a234_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0x0000ff00) << 24) | (p & 0x00ff00ff); - uint8_t alpha = p >> 24; - - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha << 48); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} - -static void -unpack_row_a234_u_to_a234_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; + host_bytes = { { 0, 1, 2, 3 } }; - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); + if (host_bytes.u32 == 0x03020100UL) + return TRUE; - while (row_out != row_out_max) - { - unpack_pixel_a234_u_to_a234_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack u (alpha first) -> i */ - -static SMOL_INLINE void -unpack_pixel_a234_u_to_234a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p >> 24; - - out [0] = (((((p64 & 0x00ff0000) << 16) | ((p64 & 0x0000ff00) >> 8)) * alpha)); - out [1] = (((((p64 & 0x000000ff) << 32) * alpha))) | (alpha << 8) | 0x80; -} - -static void -unpack_row_a234_u_to_234a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_a234_u_to_234a_i_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack 
u (alpha last) -> p */ - -static SMOL_INLINE uint64_t -unpack_pixel_123a_u_to_132a_p_64bpp (uint32_t p) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; - - return premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); -} - -static void -unpack_row_123a_u_to_132a_p_64bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - *(row_out++) = unpack_pixel_123a_u_to_132a_p_64bpp (*(row_in++)); - } -} - -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_p_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = (((uint64_t) p & 0xff00ff00) << 24) | (p & 0x00ff0000); - uint8_t alpha = p & 0xff; - - p64 = premul_u_to_p_64bpp (p64, alpha) | ((uint64_t) alpha); - out [0] = (p64 >> 16) & 0x000000ff000000ff; - out [1] = p64 & 0x000000ff000000ff; -} - -static void -unpack_row_123a_u_to_123a_p_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_p_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* Unpack u (alpha last) -> i */ - -static SMOL_INLINE void -unpack_pixel_123a_u_to_123a_i_128bpp (uint32_t p, - uint64_t *out) -{ - uint64_t p64 = p; - uint64_t alpha = p & 0xff; - - out [0] = (((((p64 & 0xff000000) << 8) | ((p64 & 0x00ff0000) >> 16)) * alpha)); - out [1] = (((((p64 & 0x0000ff00) << 24) * alpha))) | (alpha << 8) | 0x80; -} - -static void -unpack_row_123a_u_to_123a_i_128bpp (const uint32_t * SMOL_RESTRICT row_in, - uint64_t * SMOL_RESTRICT row_out, - uint32_t n_pixels) -{ - uint64_t *row_out_max = row_out + n_pixels * 2; - - SMOL_ASSUME_ALIGNED (row_out, uint64_t *); - - while (row_out != row_out_max) - { - unpack_pixel_123a_u_to_123a_i_128bpp (*(row_in++), row_out); - row_out += 2; - } -} - -/* --- Filter helpers --- */ - -static SMOL_INLINE const uint32_t * -inrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t inrow_ofs) -{ - return (const uint32_t *) (((const uint8_t *) scale_ctx->pixels_in) + scale_ctx->rowstride_in * inrow_ofs); -} - -static SMOL_INLINE uint32_t * -outrow_ofs_to_pointer (const SmolScaleCtx *scale_ctx, - uint32_t outrow_ofs) -{ - return (uint32_t *) (((uint8_t *) scale_ctx->pixels_out) + scale_ctx->rowstride_out * outrow_ofs); -} - -static SMOL_INLINE uint64_t -weight_pixel_64bpp (uint64_t p, - uint16_t w) -{ - return ((p * w) >> 8) & 0x00ff00ff00ff00ff; -} - -/* p and out may be the same address */ -static SMOL_INLINE void -weight_pixel_128bpp (uint64_t *p, - uint64_t *out, - uint16_t w) -{ - out [0] = ((p [0] * w) >> 8) & 0x00ffffff00ffffffULL; - out [1] = ((p [1] * w) >> 8) & 0x00ffffff00ffffffULL; -} - -static SMOL_INLINE void -sum_parts_64bpp (const uint64_t ** SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT accum, - uint32_t n) -{ - const uint64_t *pp_end; - const uint64_t * SMOL_RESTRICT pp = *parts_in; - - SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t)); - - for (pp_end = pp + n; pp < pp_end; pp++) - { - *accum += *pp; - } - - *parts_in = pp; -} - -static SMOL_INLINE void -sum_parts_128bpp (const uint64_t ** SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT accum, - uint32_t n) -{ - const uint64_t *pp_end; - const uint64_t * SMOL_RESTRICT pp = *parts_in; - - 
SMOL_ASSUME_ALIGNED_TO (pp, const uint64_t *, sizeof (uint64_t) * 2); - - for (pp_end = pp + n * 2; pp < pp_end; ) - { - accum [0] += *(pp++); - accum [1] += *(pp++); - } - - *parts_in = pp; -} - -static SMOL_INLINE uint64_t -scale_64bpp (uint64_t accum, - uint64_t multiplier) -{ - uint64_t a, b; - - /* Average the inputs */ - a = ((accum & 0x0000ffff0000ffffULL) * multiplier - + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; - b = (((accum & 0xffff0000ffff0000ULL) >> 16) * multiplier - + (SMOL_BOXES_MULTIPLIER / 2) + ((SMOL_BOXES_MULTIPLIER / 2) << 32)) / SMOL_BOXES_MULTIPLIER; - - /* Return pixel */ - return (a & 0x000000ff000000ffULL) | ((b & 0x000000ff000000ffULL) << 16); -} - -static SMOL_INLINE uint64_t -scale_128bpp_half (uint64_t accum, - uint64_t multiplier) -{ - uint64_t a, b; - - a = accum & 0x00000000ffffffffULL; - a = (a * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - - b = (accum & 0xffffffff00000000ULL) >> 32; - b = (b * multiplier + SMOL_BOXES_MULTIPLIER / 2) / SMOL_BOXES_MULTIPLIER; - - return (a & 0x000000000000ffffULL) - | ((b & 0x000000000000ffffULL) << 32); -} - -static SMOL_INLINE void -scale_and_store_128bpp (const uint64_t * SMOL_RESTRICT accum, - uint64_t multiplier, - uint64_t ** SMOL_RESTRICT row_parts_out) -{ - *(*row_parts_out)++ = scale_128bpp_half (accum [0], multiplier); - *(*row_parts_out)++ = scale_128bpp_half (accum [1], multiplier); -} - -static void -add_parts (const uint64_t * SMOL_RESTRICT parts_in, - uint64_t * SMOL_RESTRICT parts_acc_out, - uint32_t n) -{ - const uint64_t *parts_in_max = parts_in + n; - - SMOL_ASSUME_ALIGNED (parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_acc_out, uint64_t *); - - while (parts_in < parts_in_max) - *(parts_acc_out++) += *(parts_in++); -} - -/* --- Precalculation --- */ - -static void -pick_filter_params (uint32_t dim_in, - uint32_t dim_out, - uint32_t *halvings_out, - uint32_t *dim_bilin_out, - SmolFilterType *filter_out, - SmolStorageType *storage_out) -{ - *dim_bilin_out = dim_out; - *storage_out = SMOL_STORAGE_64BPP; - - /* The box algorithms are only sufficiently precise when - * dim_in > dim_out * 5. box_64bpp typically starts outperforming - * bilinear+halving at dim_in > dim_out * 8. */ - - if (dim_in > dim_out * 255) - { - *filter_out = SMOL_FILTER_BOX; - *storage_out = SMOL_STORAGE_128BPP; - } - else if (dim_in > dim_out * 8) - { - *filter_out = SMOL_FILTER_BOX; - } - else if (dim_in == 1) - { - *filter_out = SMOL_FILTER_ONE; - } - else if (dim_in == dim_out) - { - *filter_out = SMOL_FILTER_COPY; - } - else - { - uint32_t n_halvings = 0; - uint32_t d = dim_out; - - for (;;) - { - d *= 2; - if (d >= dim_in) - break; - n_halvings++; - } - - dim_out <<= n_halvings; - *dim_bilin_out = dim_out; - *filter_out = SMOL_FILTER_BILINEAR_0H + n_halvings; - *halvings_out = n_halvings; - } -} - -static void -precalc_bilinear_array (uint16_t *array, - uint32_t dim_in, - uint32_t dim_out, - unsigned int make_absolute_offsets) -{ - uint64_t ofs_stepF, fracF, frac_stepF; - uint16_t *pu16 = array; - uint16_t last_ofs = 0; - - if (dim_in > dim_out) - { - /* Minification */ - frac_stepF = ofs_stepF = (dim_in * SMOL_BILIN_MULTIPLIER) / dim_out; - fracF = (frac_stepF - SMOL_BILIN_MULTIPLIER) / 2; - } - else - { - /* Magnification */ - frac_stepF = ofs_stepF = ((dim_in - 1) * SMOL_BILIN_MULTIPLIER) / (dim_out > 1 ? 
(dim_out - 1) : 1); - fracF = 0; - } - - do - { - uint16_t ofs = fracF / SMOL_BILIN_MULTIPLIER; - - /* We sample ofs and its neighbor -- prevent out of bounds access - * for the latter. */ - if (ofs >= dim_in - 1) - break; - - *(pu16++) = make_absolute_offsets ? ofs : ofs - last_ofs; - *(pu16++) = SMOL_SMALL_MUL - ((fracF / (SMOL_BILIN_MULTIPLIER / SMOL_SMALL_MUL)) % SMOL_SMALL_MUL); - fracF += frac_stepF; - - last_ofs = ofs; - } - while (--dim_out); - - /* Instead of going out of bounds, sample the final pair of pixels with a 100% - * bias towards the last pixel */ - while (dim_out) - { - *(pu16++) = make_absolute_offsets ? dim_in - 2 : (dim_in - 2) - last_ofs; - *(pu16++) = 0; - dim_out--; - - last_ofs = dim_in - 2; - } -} - -static void -precalc_boxes_array (uint16_t *array, - uint32_t *span_mul, - uint32_t dim_in, - uint32_t dim_out, - unsigned int make_absolute_offsets) -{ - uint64_t fracF, frac_stepF; - uint16_t *pu16 = array; - uint16_t ofs, next_ofs; - uint64_t f; - uint64_t stride; - uint64_t a, b; - - frac_stepF = ((uint64_t) dim_in * SMOL_BIG_MUL) / (uint64_t) dim_out; - fracF = 0; - ofs = 0; - - stride = frac_stepF / (uint64_t) SMOL_BIG_MUL; - f = (frac_stepF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; - - a = (SMOL_BOXES_MULTIPLIER * 255); - b = ((stride * 255) + ((f * 255) / 256)); - *span_mul = (a + (b / 2)) / b; - - do - { - fracF += frac_stepF; - next_ofs = (uint64_t) fracF / ((uint64_t) SMOL_BIG_MUL); - - /* Prevent out of bounds access */ - if (ofs >= dim_in - 1) - break; - - if (next_ofs > dim_in) - { - next_ofs = dim_in; - if (next_ofs <= ofs) - break; - } - - stride = next_ofs - ofs - 1; - f = (fracF / SMOL_SMALL_MUL) % SMOL_SMALL_MUL; - - /* Fraction is the other way around, since left pixel of each span - * comes first, and it's on the right side of the fractional sample. */ - *(pu16++) = make_absolute_offsets ? ofs : stride; - *(pu16++) = f; - - ofs = next_ofs; - } - while (--dim_out); - - /* Instead of going out of bounds, sample the final pair of pixels with a 100% - * bias towards the last pixel */ - while (dim_out) - { - *(pu16++) = make_absolute_offsets ? ofs : 0; - *(pu16++) = 0; - dim_out--; - } - - *(pu16++) = make_absolute_offsets ? 
ofs : 0; - *(pu16++) = 0; -} - -/* --- Horizontal scaling --- */ - -#define DEF_INTERP_HORIZONTAL_BILINEAR(n_halvings) \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum = 0; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++); \ - F = *(ofs_x++); \ - \ - p = *row_parts_in; \ - q = *(row_parts_in + 1); \ - \ - accum += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - } \ - *(row_parts_out++) = ((accum) >> (n_halvings)) & 0x00ff00ff00ff00ffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} \ - \ -static void \ -interp_horizontal_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - const uint64_t * SMOL_RESTRICT row_parts_in, \ - uint64_t * SMOL_RESTRICT row_parts_out) \ -{ \ - uint64_t p, q; \ - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; \ - uint64_t F; \ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; \ - int i; \ - \ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); \ - \ - do \ - { \ - uint64_t accum [2] = { 0 }; \ - \ - for (i = 0; i < (1 << (n_halvings)); i++) \ - { \ - row_parts_in += *(ofs_x++) * 2; \ - F = *(ofs_x++); \ - \ - p = row_parts_in [0]; \ - q = row_parts_in [2]; \ - \ - accum [0] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - \ - p = row_parts_in [1]; \ - q = row_parts_in [3]; \ - \ - accum [1] += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - } \ - *(row_parts_out++) = ((accum [0]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ - *(row_parts_out++) = ((accum [1]) >> (n_halvings)) & 0x00ffffff00ffffffULL; \ - } \ - while (row_parts_out != row_parts_out_max); \ -} - -static void -interp_horizontal_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - do - { - row_parts_in += *(ofs_x++); - F = *(ofs_x++); - - p = *row_parts_in; - q = *(row_parts_in + 1); - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (row_parts_out != row_parts_out_max); -} - -static void -interp_horizontal_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t p, q; - const uint16_t * SMOL_RESTRICT ofs_x = scale_ctx->offsets_x; - uint64_t F; - uint64_t * SMOL_RESTRICT row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - do - { - row_parts_in += *(ofs_x++) * 2; - F = *(ofs_x++); - - p = row_parts_in [0]; - q = row_parts_in [2]; - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - - p = 
row_parts_in [1]; - q = row_parts_in [3]; - - *(row_parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (row_parts_out != row_parts_out_max); -} - -DEF_INTERP_HORIZONTAL_BILINEAR(1) -DEF_INTERP_HORIZONTAL_BILINEAR(2) -DEF_INTERP_HORIZONTAL_BILINEAR(3) -DEF_INTERP_HORIZONTAL_BILINEAR(4) -DEF_INTERP_HORIZONTAL_BILINEAR(5) -DEF_INTERP_HORIZONTAL_BILINEAR(6) - -static void -interp_horizontal_boxes_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out - 1; - uint64_t accum = 0; - uint64_t p, q, r, s; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; - p = weight_pixel_64bpp (*(pp++), 256); - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) - { - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); - - r = *(pp++); - s = r * F; - - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - - accum += p + q; - - /* (255 * r) - (F * r) */ - p = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; - - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); - accum = 0; - } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_64bpp ((const uint64_t ** SMOL_RESTRICT) &pp, &accum, n); - - q = 0; - F = *(ofs_x); - if (F > 0) - q = weight_pixel_64bpp (*(pp), F); - - accum += p + q; - *(row_parts_out++) = scale_64bpp (accum, scale_ctx->span_mul_x); -} - -static void -interp_horizontal_boxes_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t *row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - const uint64_t * SMOL_RESTRICT pp; - const uint16_t *ofs_x = scale_ctx->offsets_x; - uint64_t *row_parts_out_max = row_parts_out + (scale_ctx->width_out - /* 2 */ 1) * 2; - uint64_t accum [2] = { 0, 0 }; - uint64_t p [2], q [2], r [2], s [2]; - uint32_t n; - uint64_t F; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - pp = row_parts_in; - - p [0] = *(pp++); - p [1] = *(pp++); - weight_pixel_128bpp (p, p, 256); - - n = *(ofs_x++); - - while (row_parts_out != row_parts_out_max) - { - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - F = *(ofs_x++); - n = *(ofs_x++); - - r [0] = *(pp++); - r [1] = *(pp++); - - s [0] = r [0] * F; - s [1] = r [1] * F; - - q [0] = (s [0] >> 8) & 0x00ffffff00ffffff; - q [1] = (s [1] >> 8) & 0x00ffffff00ffffff; - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - p [0] = (((r [0] << 8) - r [0] - s [0]) >> 8) & 0x00ffffff00ffffff; - p [1] = (((r [1] << 8) - r [1] - s [1]) >> 8) & 0x00ffffff00ffffff; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); - - accum [0] = 0; - accum [1] = 0; - } - - /* Final box optionally features the rightmost fractional pixel */ - - sum_parts_128bpp ((const uint64_t ** SMOL_RESTRICT) &pp, accum, n); - - q [0] = 0; - q [1] = 0; - - F = *(ofs_x); - if (F > 0) - { - q [0] = *(pp++); - q [1] = *(pp++); - weight_pixel_128bpp (q, q, F); - } - - accum [0] += p [0] + q [0]; - accum [1] += p [1] + q [1]; - - scale_and_store_128bpp (accum, - scale_ctx->span_mul_x, - (uint64_t ** SMOL_RESTRICT) &row_parts_out); -} - -static void -interp_horizontal_one_64bpp (const SmolScaleCtx 
*scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out; - uint64_t part; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - part = *row_parts_in; - while (row_parts_out != row_parts_out_max) - *(row_parts_out++) = part; -} - -static void -interp_horizontal_one_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - uint64_t *row_parts_out_max = row_parts_out + scale_ctx->width_out * 2; - - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - while (row_parts_out != row_parts_out_max) - { - *(row_parts_out++) = row_parts_in [0]; - *(row_parts_out++) = row_parts_in [1]; - } -} - -static void -interp_horizontal_copy_64bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * sizeof (uint64_t)); -} - -static void -interp_horizontal_copy_128bpp (const SmolScaleCtx *scale_ctx, - const uint64_t * SMOL_RESTRICT row_parts_in, - uint64_t * SMOL_RESTRICT row_parts_out) -{ - SMOL_ASSUME_ALIGNED (row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (row_parts_out, uint64_t *); - - memcpy (row_parts_out, row_parts_in, scale_ctx->width_out * 2 * sizeof (uint64_t)); -} - -static void -scale_horizontal (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - const uint32_t *row_in, - uint64_t *row_parts_out) -{ - uint64_t * SMOL_RESTRICT unpacked_in; - - unpacked_in = vertical_ctx->parts_row [3]; - - /* 32-bit unpackers need 32-bit alignment */ - if ((((uintptr_t) row_in) & 3) - && scale_ctx->pixel_type_in != SMOL_PIXEL_RGB8 - && scale_ctx->pixel_type_in != SMOL_PIXEL_BGR8) - { - if (!vertical_ctx->in_aligned) - vertical_ctx->in_aligned = - smol_alloc_aligned (scale_ctx->width_in * sizeof (uint32_t), - &vertical_ctx->in_aligned_storage); - memcpy (vertical_ctx->in_aligned, row_in, scale_ctx->width_in * sizeof (uint32_t)); - row_in = vertical_ctx->in_aligned; - } - - scale_ctx->unpack_row_func (row_in, - unpacked_in, - scale_ctx->width_in); - scale_ctx->hfilter_func (scale_ctx, - unpacked_in, - row_parts_out); -} - -/* --- Vertical scaling --- */ - -static void -update_vertical_ctx_bilinear (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index) -{ - uint32_t new_in_ofs = scale_ctx->offsets_y [outrow_index * 2]; - - if (new_in_ofs == vertical_ctx->in_ofs) - return; - - if (new_in_ofs == vertical_ctx->in_ofs + 1) - { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; - - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); - } - else - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs), - vertical_ctx->parts_row [0]); - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, new_in_ofs + 1), - vertical_ctx->parts_row [1]); - } - - vertical_ctx->in_ofs = new_in_ofs; -} - -static void -interp_vertical_bilinear_store_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - 
const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t width) -{ - uint64_t *parts_out_last = parts_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (parts_out != parts_out_last); -} - -static void -interp_vertical_bilinear_add_64bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, - uint32_t width) -{ - uint64_t *accum_out_last = accum_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; - } - while (accum_out != accum_out_last); -} - -static void -interp_vertical_bilinear_store_128bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t width) -{ - uint64_t *parts_out_last = parts_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(parts_out++) = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (parts_out != parts_out_last); -} - -static void -interp_vertical_bilinear_add_128bpp (uint64_t F, - const uint64_t * SMOL_RESTRICT top_row_parts_in, - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, - uint64_t * SMOL_RESTRICT accum_out, - uint32_t width) -{ - uint64_t *accum_out_last = accum_out + width; - - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); - SMOL_ASSUME_ALIGNED (accum_out, uint64_t *); - - do - { - uint64_t p, q; - - p = *(top_row_parts_in++); - q = *(bottom_row_parts_in++); - - *(accum_out++) += ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; - } - while (accum_out != accum_out_last); -} - -#define DEF_INTERP_VERTICAL_BILINEAR_FINAL(n_halvings) \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_64bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ - uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ff00ff00ff00ffULL; \ - p = ((p + *accum_inout) >> n_halvings) & 0x00ff00ff00ff00ffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} \ - \ -static void \ -interp_vertical_bilinear_final_##n_halvings##h_128bpp (uint64_t F, \ - const uint64_t * SMOL_RESTRICT top_row_parts_in, \ - const 
uint64_t * SMOL_RESTRICT bottom_row_parts_in, \ - uint64_t * SMOL_RESTRICT accum_inout, \ - uint32_t width) \ -{ \ - uint64_t *accum_inout_last = accum_inout + width; \ - \ - SMOL_ASSUME_ALIGNED (top_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (bottom_row_parts_in, const uint64_t *); \ - SMOL_ASSUME_ALIGNED (accum_inout, uint64_t *); \ - \ - do \ - { \ - uint64_t p, q; \ - \ - p = *(top_row_parts_in++); \ - q = *(bottom_row_parts_in++); \ - \ - p = ((((p - q) * F) >> 8) + q) & 0x00ffffff00ffffffULL; \ - p = ((p + *accum_inout) >> n_halvings) & 0x00ffffff00ffffffULL; \ - \ - *(accum_inout++) = p; \ - } \ - while (accum_inout != accum_inout_last); \ -} - -#define DEF_SCALE_OUTROW_BILINEAR(n_halvings) \ -static void \ -scale_outrow_bilinear_##n_halvings##h_64bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} \ - \ -static void \ -scale_outrow_bilinear_##n_halvings##h_128bpp (const SmolScaleCtx *scale_ctx, \ - SmolVerticalCtx *vertical_ctx, \ - uint32_t outrow_index, \ - uint32_t *row_out) \ -{ \ - uint32_t bilin_index = outrow_index << (n_halvings); \ - unsigned int i; \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - \ - for (i = 0; i < (1 << (n_halvings)) - 2; i++) \ - { \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_add_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - bilin_index++; \ - } \ - \ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); \ - interp_vertical_bilinear_final_##n_halvings##h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], \ - vertical_ctx->parts_row [0], \ - vertical_ctx->parts_row [1], \ - vertical_ctx->parts_row [2], \ - scale_ctx->width_out * 2); \ - \ - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); \ -} - -static void -scale_outrow_bilinear_0h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t 
outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} - -static void -scale_outrow_bilinear_0h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, outrow_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [outrow_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); -} - -DEF_INTERP_VERTICAL_BILINEAR_FINAL(1) - -static void -scale_outrow_bilinear_1h_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_64bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + return FALSE; } -static void -scale_outrow_bilinear_1h_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +/* The generic unpack/pack functions fetch and store pixels as u32. + * This means the byte order will be reversed on little endian, with + * consequences for the alpha channel and reordering logic. We deal + * with this by using the apparent byte order internally. 
*/ +static SmolPixelType +get_host_pixel_type (SmolPixelType pixel_type) { - uint32_t bilin_index = outrow_index << 1; - - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_store_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - bilin_index++; - update_vertical_ctx_bilinear (scale_ctx, vertical_ctx, bilin_index); - interp_vertical_bilinear_final_1h_128bpp (scale_ctx->offsets_y [bilin_index * 2 + 1], - vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - scale_ctx->width_out * 2); - scale_ctx->pack_row_func (vertical_ctx->parts_row [2], row_out, scale_ctx->width_out); + if (host_is_little_endian ()) + return pixel_type_u32_le [pixel_type]; + + return pixel_type; } -DEF_INTERP_VERTICAL_BILINEAR_FINAL(2) -DEF_SCALE_OUTROW_BILINEAR(2) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(3) -DEF_SCALE_OUTROW_BILINEAR(3) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(4) -DEF_SCALE_OUTROW_BILINEAR(4) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(5) -DEF_SCALE_OUTROW_BILINEAR(5) -DEF_INTERP_VERTICAL_BILINEAR_FINAL(6) -DEF_SCALE_OUTROW_BILINEAR(6) +/* ---------------------- * + * Context initialization * + * ---------------------- */ static void -finalize_vertical_64bpp (const uint64_t * SMOL_RESTRICT accums, - uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t n) +pick_filter_params (uint32_t src_dim, + uint32_t src_dim_spx, + int32_t dest_ofs_spx, + uint32_t dest_dim, + uint32_t dest_dim_spx, + uint32_t *dest_halvings, + uint32_t *dest_dim_prehalving, + uint32_t *dest_dim_prehalving_spx, + SmolFilterType *dest_filter, + SmolStorageType *dest_storage, + uint16_t *first_opacity, + uint16_t *last_opacity, + SmolFlags flags) { - uint64_t *parts_out_max = parts_out + n; + *dest_dim_prehalving = dest_dim; + *dest_storage = (flags & SMOL_DISABLE_SRGB_LINEARIZATION) ? SMOL_STORAGE_64BPP : SMOL_STORAGE_128BPP; - SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); + *first_opacity = SMOL_SUBPIXEL_MOD (-dest_ofs_spx - 1) + 1; + *last_opacity = SMOL_SUBPIXEL_MOD (dest_ofs_spx + dest_dim_spx - 1) + 1; - while (parts_out != parts_out_max) - { - *(parts_out++) = scale_64bpp (*(accums++), multiplier); - } -} - -static void -weight_edge_row_64bpp (uint64_t *row, - uint16_t w, - uint32_t n) -{ - uint64_t *row_max = row + n; + /* Special handling when the output is a single pixel */ - SMOL_ASSUME_ALIGNED (row, uint64_t *); - - while (row != row_max) + if (dest_dim == 1) { - *row = ((*row * w) >> 8) & 0x00ff00ff00ff00ffULL; - row++; + *first_opacity = dest_dim_spx; + *last_opacity = 256; } -} -static void -scale_and_weight_edge_rows_box_64bpp (const uint64_t * SMOL_RESTRICT first_row, - uint64_t * SMOL_RESTRICT last_row, - uint64_t * SMOL_RESTRICT accum, - uint16_t w2, - uint32_t n) -{ - const uint64_t *first_row_max = first_row + n; - - SMOL_ASSUME_ALIGNED (first_row, const uint64_t *); - SMOL_ASSUME_ALIGNED (last_row, uint64_t *); - SMOL_ASSUME_ALIGNED (accum, uint64_t *); + /* The box algorithms are only sufficiently precise when + * src_dim > dest_dim * 5. box_64bpp typically starts outperforming + * bilinear+halving at src_dim > dest_dim * 8. 
*/ - while (first_row != first_row_max) + if (src_dim > dest_dim * 255) { - uint64_t r, s, p, q; - - p = *(first_row++); - - r = *(last_row); - s = r * w2; - q = (s >> 8) & 0x00ff00ff00ff00ffULL; - /* (255 * r) - (F * r) */ - *(last_row++) = (((r << 8) - r - s) >> 8) & 0x00ff00ff00ff00ffULL; - - *(accum++) = p + q; + *dest_storage = SMOL_STORAGE_128BPP; + *dest_filter = SMOL_FILTER_BOX; } -} - -static void -update_vertical_ctx_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t ofs_y, - uint32_t ofs_y_max, - uint16_t w1, - uint16_t w2) -{ - /* Old in_ofs is the previous max */ - if (ofs_y == vertical_ctx->in_ofs) + else if (src_dim > dest_dim * 8) { - uint64_t *t = vertical_ctx->parts_row [0]; - vertical_ctx->parts_row [0] = vertical_ctx->parts_row [1]; - vertical_ctx->parts_row [1] = t; + *dest_filter = SMOL_FILTER_BOX; } - else + else if (src_dim <= 1) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_edge_row_64bpp (vertical_ctx->parts_row [0], w1, scale_ctx->width_out); + *dest_filter = SMOL_FILTER_ONE; + *last_opacity = ((dest_ofs_spx + dest_dim_spx - 1) % SMOL_SUBPIXEL_MUL) + 1; } - - /* When w2 == 0, the final inrow may be out of bounds. Don't try to access it in - * that case. */ - if (w2 || ofs_y_max < scale_ctx->height_in) + else if ((dest_ofs_spx & 0xff) == 0 && src_dim_spx == dest_dim_spx) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y_max), - vertical_ctx->parts_row [1]); + *dest_filter = SMOL_FILTER_COPY; + *first_opacity = 256; + *last_opacity = 256; } else { - memset (vertical_ctx->parts_row [1], 0, scale_ctx->width_out * sizeof (uint64_t)); + uint32_t n_halvings = 0; + uint32_t d = dest_dim_spx; + + for (;;) + { + d *= 2; + if (d >= src_dim_spx) + break; + n_halvings++; + } + + *dest_dim_prehalving = dest_dim << n_halvings; + *dest_dim_prehalving_spx = dest_dim_spx << n_halvings; + *dest_filter = SMOL_FILTER_BILINEAR_0H + n_halvings; + *dest_halvings = n_halvings; } - vertical_ctx->in_ofs = ofs_y_max; } -static void -scale_outrow_box_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +static const SmolRepackMeta * +find_repack_match (const SmolRepackMeta *meta, uint16_t sig, uint16_t mask) { - uint32_t ofs_y, ofs_y_max; - uint16_t w1, w2; - - /* Get the inrow range for this outrow: [ofs_y .. ofs_y_max> */ + sig &= mask; - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; - - /* Scale the first and last rows, weight them and store in accumulator */ - - w1 = (outrow_index == 0) ? 
256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1]; - w2 = scale_ctx->offsets_y [outrow_index * 2 + 1]; - - update_vertical_ctx_box_64bpp (scale_ctx, vertical_ctx, ofs_y, ofs_y_max, w1, w2); - - scale_and_weight_edge_rows_box_64bpp (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [1], - vertical_ctx->parts_row [2], - w2, - scale_ctx->width_out); - - ofs_y++; - - /* Add up whole rows */ - - while (ofs_y < ofs_y_max) + for (;; meta++) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - add_parts (vertical_ctx->parts_row [0], - vertical_ctx->parts_row [2], - scale_ctx->width_out); - - ofs_y++; + if (!meta->repack_row_func) + { + meta = NULL; + break; + } + + if (sig == (meta->signature & mask)) + break; } - finalize_vertical_64bpp (vertical_ctx->parts_row [2], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [0], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + return meta; } static void -finalize_vertical_128bpp (const uint64_t * SMOL_RESTRICT accums, - uint64_t multiplier, - uint64_t * SMOL_RESTRICT parts_out, - uint32_t n) +do_reorder (const uint8_t *order_in, uint8_t *order_out, const uint8_t *reorder) { - uint64_t *parts_out_max = parts_out + n * 2; + int i; - SMOL_ASSUME_ALIGNED (accums, const uint64_t *); - SMOL_ASSUME_ALIGNED (parts_out, uint64_t *); - - while (parts_out != parts_out_max) + for (i = 0; i < 4; i++) { - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - *(parts_out++) = scale_128bpp_half (*(accums++), multiplier); - } -} - -static void -weight_row_128bpp (uint64_t *row, - uint16_t w, - uint32_t n) -{ - uint64_t *row_max = row + (n * 2); + uint8_t r = reorder [i]; + uint8_t o; - SMOL_ASSUME_ALIGNED (row, uint64_t *); + if (r == 0) + { + o = 0; + } + else + { + o = order_in [r - 1]; + if (o == 0) + o = i + 1; + } - while (row != row_max) - { - row [0] = ((row [0] * w) >> 8) & 0x00ffffff00ffffffULL; - row [1] = ((row [1] * w) >> 8) & 0x00ffffff00ffffffULL; - row += 2; + order_out [i] = o; } } static void -scale_outrow_box_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) +find_repacks (const SmolImplementation **implementations, + SmolStorageType src_storage, SmolStorageType mid_storage, SmolStorageType dest_storage, + SmolAlphaType src_alpha, SmolAlphaType mid_alpha, SmolAlphaType dest_alpha, + SmolGammaType src_gamma, SmolGammaType mid_gamma, SmolGammaType dest_gamma, + const SmolPixelTypeMeta *src_pmeta, const SmolPixelTypeMeta *dest_pmeta, + const SmolRepackMeta **src_repack, const SmolRepackMeta **dest_repack) { - uint32_t ofs_y, ofs_y_max; - uint16_t w; + int src_impl, dest_impl; + const SmolRepackMeta *src_meta, *dest_meta = NULL; + uint16_t src_to_mid_sig, mid_to_dest_sig; + uint16_t sig_mask; + int reorder_dest_alpha_ch; - /* Get the inrow range for this outrow: [ofs_y .. 
ofs_y_max> */ + sig_mask = SMOL_REPACK_SIGNATURE_ANY_ORDER_MASK (1, 1, 1, 1, 1, 1); + src_to_mid_sig = SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER (src_storage, src_alpha, src_gamma, + mid_storage, mid_alpha, mid_gamma); + mid_to_dest_sig = SMOL_MAKE_REPACK_SIGNATURE_ANY_ORDER (mid_storage, mid_alpha, mid_gamma, + dest_storage, dest_alpha, dest_gamma); - ofs_y = scale_ctx->offsets_y [outrow_index * 2]; - ofs_y_max = scale_ctx->offsets_y [(outrow_index + 1) * 2]; + /* The initial conversion must always leave alpha in position #4, so further + * processing knows where to find it. The order of the other channels + * doesn't matter, as long as there's a repack chain that ultimately + * produces the desired result. */ + reorder_dest_alpha_ch = src_pmeta->order [0] == 4 ? 1 : 4; - /* Scale the first inrow and store it */ + for (src_impl = 0; implementations [src_impl]; src_impl++) + { + src_meta = &implementations [src_impl]->repack_meta [0]; - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [0]); - weight_row_128bpp (vertical_ctx->parts_row [0], - outrow_index == 0 ? 256 : 255 - scale_ctx->offsets_y [outrow_index * 2 - 1], - scale_ctx->width_out); - ofs_y++; + for (;; src_meta++) + { + uint8_t mid_order [4]; - /* Add up whole rows */ + src_meta = find_repack_match (src_meta, src_to_mid_sig, sig_mask); + if (!src_meta) + break; - while (ofs_y < ofs_y_max) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); - - ofs_y++; - } + if (reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (src_meta->signature)].dest [3] != reorder_dest_alpha_ch) + continue; - /* Final row is optional; if this is the bottommost outrow it could be out of bounds */ + do_reorder (src_pmeta->order, mid_order, + reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (src_meta->signature)].dest); - w = scale_ctx->offsets_y [outrow_index * 2 + 1]; - if (w > 0) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, ofs_y), - vertical_ctx->parts_row [1]); - weight_row_128bpp (vertical_ctx->parts_row [1], - w - 1, /* Subtract 1 to avoid overflow */ - scale_ctx->width_out); - add_parts (vertical_ctx->parts_row [1], - vertical_ctx->parts_row [0], - scale_ctx->width_out * 2); - } + for (dest_impl = 0; implementations [dest_impl]; dest_impl++) + { + dest_meta = &implementations [dest_impl]->repack_meta [0]; - finalize_vertical_128bpp (vertical_ctx->parts_row [0], - scale_ctx->span_mul_y, - vertical_ctx->parts_row [1], - scale_ctx->width_out); - scale_ctx->pack_row_func (vertical_ctx->parts_row [1], row_out, scale_ctx->width_out); -} + for (;; dest_meta++) + { + uint8_t dest_order [4]; -static void -scale_outrow_one_64bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) -{ - SMOL_UNUSED (row_index); + dest_meta = find_repack_match (dest_meta, mid_to_dest_sig, sig_mask); + if (!dest_meta) + break; - /* Scale the row and store it */ + do_reorder (mid_order, dest_order, + reorder_meta [SMOL_REPACK_SIGNATURE_GET_REORDER (dest_meta->signature)].dest); - if (vertical_ctx->in_ofs != 0) - { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; + if (*((uint32_t *) dest_order) == *((uint32_t *) dest_pmeta->order)) + { + /* Success */ + goto out; + 
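
The search succeeds when composing the source pixel order with the two reorder steps reproduces the destination's channel order exactly. A small, hypothetical walk-through of how do_reorder () composes orders (the concrete values are made up for clarity and are not taken from the patch):

/* Source is RGBA, i.e. order { 1, 2, 3, 4 }.  A repack whose reorder is
 * { 3, 2, 1, 4 } yields the mid order { 3, 2, 1, 4 } (BGRA, alpha still in
 * slot 4 as required); applying the same reorder again restores
 * { 1, 2, 3, 4 }, which would match an RGBA destination. */
const uint8_t swap_13 [4] = { 3, 2, 1, 4 };
uint8_t src_order [4] = { 1, 2, 3, 4 };
uint8_t mid_order [4];
uint8_t dest_order [4];

do_reorder (src_order, mid_order, swap_13);   /* mid_order  = { 3, 2, 1, 4 } */
do_reorder (mid_order, dest_order, swap_13);  /* dest_order = { 1, 2, 3, 4 } */
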
} + } + } + } } - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); +out: + if (src_repack) + *src_repack = src_meta; + if (dest_repack) + *dest_repack = dest_meta; } static void -scale_outrow_one_128bpp (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) +populate_clear_batch (SmolScaleCtx *scale_ctx) { - SMOL_UNUSED (row_index); + uint8_t dest_color [16]; + int pixel_stride; + int i; - /* Scale the row and store it */ + scale_ctx->pack_row_func (scale_ctx->color_pixel, dest_color, 1); + pixel_stride = pixel_type_meta [scale_ctx->dest_pixel_type].pixel_stride; - if (vertical_ctx->in_ofs != 0) + for (i = 0; i != SMOL_CLEAR_BATCH_SIZE; i += pixel_stride) { - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, 0), - vertical_ctx->parts_row [0]); - vertical_ctx->in_ofs = 0; - } - - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); -} - -static void -scale_outrow_copy (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t row_index, - uint32_t *row_out) -{ - scale_horizontal (scale_ctx, - vertical_ctx, - inrow_ofs_to_pointer (scale_ctx, row_index), - vertical_ctx->parts_row [0]); + /* Must be an exact fit */ + SMOL_ASSERT (i + pixel_stride <= SMOL_CLEAR_BATCH_SIZE); - scale_ctx->pack_row_func (vertical_ctx->parts_row [0], row_out, scale_ctx->width_out); + memcpy (scale_ctx->color_pixels_clear_batch + i, dest_color, pixel_stride); + } } -static void -scale_outrow (const SmolScaleCtx *scale_ctx, - SmolVerticalCtx *vertical_ctx, - uint32_t outrow_index, - uint32_t *row_out) -{ - scale_ctx->vfilter_func (scale_ctx, - vertical_ctx, - outrow_index, - row_out); - - if (scale_ctx->post_row_func) - scale_ctx->post_row_func (row_out, scale_ctx->width_out, scale_ctx->user_data); -} +#define IMPLEMENTATION_MAX 8 +/* scale_ctx->storage_type must be initialized first by pick_filter_params() */ static void -do_rows (const SmolScaleCtx *scale_ctx, - void *outrows_dest, - uint32_t row_out_index, - uint32_t n_rows) +get_implementations (SmolScaleCtx *scale_ctx, const void *color_pixel, SmolPixelType color_pixel_type) { - SmolVerticalCtx vertical_ctx = { 0 }; - uint32_t n_parts_per_pixel = 1; - uint32_t n_stored_rows = 4; - uint32_t i; + SmolPixelType src_ptype, dest_ptype; + const SmolPixelTypeMeta *src_pmeta, *dest_pmeta; + const SmolRepackMeta *src_rmeta, *dest_rmeta; + SmolAlphaType internal_alpha = SMOL_ALPHA_PREMUL8; + const SmolImplementation *implementations [IMPLEMENTATION_MAX]; + int i = 0; - if (scale_ctx->storage_type == SMOL_STORAGE_128BPP) - n_parts_per_pixel = 2; + if (color_pixel) + scale_ctx->have_composite_color = TRUE; - /* Must be one less, or this test in update_vertical_ctx() will wrap around: - * if (new_in_ofs == vertical_ctx->in_ofs + 1) { ... 
} */ - vertical_ctx.in_ofs = UINT_MAX - 1; + /* Check for noop (direct copy) */ - for (i = 0; i < n_stored_rows; i++) + if (scale_ctx->hdim.src_size_spx == scale_ctx->hdim.dest_size_spx + && scale_ctx->vdim.src_size_spx == scale_ctx->vdim.dest_size_spx + && scale_ctx->src_pixel_type == scale_ctx->dest_pixel_type + && scale_ctx->composite_op != SMOL_COMPOSITE_SRC_OVER_DEST) { - vertical_ctx.parts_row [i] = - smol_alloc_aligned (MAX (scale_ctx->width_in, scale_ctx->width_out) - * n_parts_per_pixel * sizeof (uint64_t), - &vertical_ctx.row_storage [i]); + /* The scaling and packing is a no-op, but we may still need to + * clear dest, so allow the rest of the function to run so we get + * the clear functions etc. */ + scale_ctx->is_noop = TRUE; } - for (i = row_out_index; i < row_out_index + n_rows; i++) - { - scale_outrow (scale_ctx, &vertical_ctx, i, outrows_dest); - outrows_dest = (uint8_t *) outrows_dest + scale_ctx->rowstride_out; - } + /* Enumerate implementations, preferred first */ - for (i = 0; i < n_stored_rows; i++) + if (!(scale_ctx->flags & SMOL_DISABLE_ACCELERATION)) { - smol_free (vertical_ctx.row_storage [i]); +#ifdef SMOL_WITH_AVX2 + if (have_avx2 ()) + implementations [i++] = _smol_get_avx2_implementation (); +#endif } - /* Used to align row data if needed. May be allocated in scale_horizontal(). */ - if (vertical_ctx.in_aligned) - smol_free (vertical_ctx.in_aligned_storage); -} + implementations [i++] = _smol_get_generic_implementation (); + implementations [i] = NULL; -/* --- Conversion tables --- */ + /* Install repackers */ -static const SmolConversionTable generic_conversions = -{ -{ { - /* Conversions where accumulators must hold the sum of fewer than - * 256 pixels. This can be done in 64bpp, but 128bpp may be used - * e.g. for 16 bits per channel internally premultiplied data. 
*/ + src_ptype = get_host_pixel_type (scale_ctx->src_pixel_type); + dest_ptype = get_host_pixel_type (scale_ctx->dest_pixel_type); - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 3214, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 1234, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4321, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, 132a, p, 4123, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 321, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, 132a, p, 123, u, 64), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - }, - /* ABGR8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 4321, u, 64), - /* BGRA8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 2341, u, 64), - /* ARGB8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1432, u, 64), - /* ABGR8 un */ SMOL_CONV (1234, p, 1324, p, a324, p, 1234, u, 64), - /* RGB8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 432, u, 64), - /* BGR8 */ SMOL_CONV (1234, p, 1324, p, a324, p, 234, u, 64), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, 
i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123a, u, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 4321, p, 64), - /* BGRA8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 2341, p, 64), - /* ARGB8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1432, p, 64), - /* ABGR8 pre */ SMOL_CONV (a234, u, a324, p, 1324, p, 1234, p, 64), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 pre */ SMOL_CONV (123, p, 132a, p, 1324, p, 4123, p, 64), - /* RGBA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 3214, p, 64), - /* BGRA8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 1234, p, 64), - /* ARGB8 un */ SMOL_CONV (123, p, 132a, p, 1324, p, 4321, p, 64), - /* ABGR8 un */ SMOL_CONV (123, p, 132a, p, 
1324, p, 4123, p, 64), - /* RGB8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 321, p, 64), - /* BGR8 */ SMOL_CONV (123, p, 132a, p, 132a, p, 123, p, 64), - } - }, + src_pmeta = &pixel_type_meta [src_ptype]; + dest_pmeta = &pixel_type_meta [dest_ptype]; + if (src_pmeta->alpha == SMOL_ALPHA_UNASSOCIATED + && dest_pmeta->alpha == SMOL_ALPHA_UNASSOCIATED) { - /* Conversions where accumulators must hold the sum of up to - * 65535 pixels. We need 128bpp for this. */ + /* In order to preserve the color range in transparent pixels when going + * from unassociated to unassociated, we use 16 bits per channel internally. */ + internal_alpha = SMOL_ALPHA_PREMUL16; + scale_ctx->storage_type = SMOL_STORAGE_128BPP; + } - /* RGBA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - }, - /* BGRA8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, 123a, p, 4123, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 321, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, 123a, p, 123, u, 128), - }, - /* ARGB8 pre -> */ - { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* ABGR8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - }, - /* ABGR8 pre -> */ + if (scale_ctx->hdim.src_size_px > scale_ctx->hdim.dest_size_px * 8191 + || scale_ctx->vdim.src_size_px > scale_ctx->vdim.dest_size_px * 8191) { - /* RGBA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (1234, p, 1234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 4321, u, 128), - /* BGRA8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 2341, u, 128), - /* ARGB8 un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1432, u, 128), - /* ABGR8 
un */ SMOL_CONV (1234, p, 1234, p, a234, p, 1234, u, 128), - /* RGB8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 432, u, 128), - /* BGR8 */ SMOL_CONV (1234, p, 1234, p, a234, p, 234, u, 128), - }, - /* RGBA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - }, - /* BGRA8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123a, u, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (123a, u, 123a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (123a, u, 123a, i, 123a, i, 123, u, 128), - }, - /* ARGB8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - }, - /* ABGR8 un -> */ - { - /* RGBA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 4321, p, 128), - /* BGRA8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 2341, p, 128), - /* ARGB8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1432, p, 128), - /* ABGR8 pre */ SMOL_CONV (a234, u, a234, p, 1234, p, 1234, p, 128), - /* RGBA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 3214, u, 128), - /* BGRA8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 1234, u, 128), - /* ARGB8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4321, u, 128), - /* ABGR8 un */ SMOL_CONV (a234, u, 234a, i, 123a, i, 4123, u, 128), - /* RGB8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 321, u, 128), - /* BGR8 */ SMOL_CONV (a234, u, 234a, i, 123a, i, 123, u, 128), - }, - /* RGB8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* ARGB8 un */ SMOL_CONV 
(123, p, 123a, p, 1234, p, 4123, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - }, - /* BGR8 -> */ - { - /* RGBA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 pre */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGBA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 3214, p, 128), - /* BGRA8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 1234, p, 128), - /* ARGB8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4321, p, 128), - /* ABGR8 un */ SMOL_CONV (123, p, 123a, p, 1234, p, 4123, p, 128), - /* RGB8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 321, p, 128), - /* BGR8 */ SMOL_CONV (123, p, 123a, p, 123a, p, 123, p, 128), + /* Even with 128bpp, there's only enough bits to store 11-bit linearized + * times 13 bits of summed pixels plus 8 bits of scratch space for + * multiplying with an 8-bit weight -> 32 bits total per channel. + * + * For now, just turn off sRGB linearization if the input is bigger + * than the output by a factor of 2^13 or more. */ + scale_ctx->gamma_type = SMOL_GAMMA_SRGB_COMPRESSED; } -} } -}; -static const SmolImplementation generic_implementation = -{ - { - /* Horizontal filters */ - { - /* 64bpp */ - interp_horizontal_copy_64bpp, - interp_horizontal_one_64bpp, - interp_horizontal_bilinear_0h_64bpp, - interp_horizontal_bilinear_1h_64bpp, - interp_horizontal_bilinear_2h_64bpp, - interp_horizontal_bilinear_3h_64bpp, - interp_horizontal_bilinear_4h_64bpp, - interp_horizontal_bilinear_5h_64bpp, - interp_horizontal_bilinear_6h_64bpp, - interp_horizontal_boxes_64bpp - }, - { - /* 128bpp */ - interp_horizontal_copy_128bpp, - interp_horizontal_one_128bpp, - interp_horizontal_bilinear_0h_128bpp, - interp_horizontal_bilinear_1h_128bpp, - interp_horizontal_bilinear_2h_128bpp, - interp_horizontal_bilinear_3h_128bpp, - interp_horizontal_bilinear_4h_128bpp, - interp_horizontal_bilinear_5h_128bpp, - interp_horizontal_bilinear_6h_128bpp, - interp_horizontal_boxes_128bpp - } - }, - { - /* Vertical filters */ - { - /* 64bpp */ - scale_outrow_copy, - scale_outrow_one_64bpp, - scale_outrow_bilinear_0h_64bpp, - scale_outrow_bilinear_1h_64bpp, - scale_outrow_bilinear_2h_64bpp, - scale_outrow_bilinear_3h_64bpp, - scale_outrow_bilinear_4h_64bpp, - scale_outrow_bilinear_5h_64bpp, - scale_outrow_bilinear_6h_64bpp, - scale_outrow_box_64bpp - }, - { - /* 128bpp */ - scale_outrow_copy, - scale_outrow_one_128bpp, - scale_outrow_bilinear_0h_128bpp, - scale_outrow_bilinear_1h_128bpp, - scale_outrow_bilinear_2h_128bpp, - scale_outrow_bilinear_3h_128bpp, - scale_outrow_bilinear_4h_128bpp, - scale_outrow_bilinear_5h_128bpp, - scale_outrow_bilinear_6h_128bpp, - scale_outrow_box_128bpp - } - }, - &generic_conversions -}; + find_repacks (implementations, + src_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage, + src_pmeta->alpha, internal_alpha, dest_pmeta->alpha, + SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED, + src_pmeta, dest_pmeta, + &src_rmeta, &dest_rmeta); -/* In the absence of a proper build system, runtime detection is more - portable than compiler macros. WFM. 
-static SmolBool
-host_is_little_endian (void)
-{
-    static const union
-    {
-        uint8_t u8 [4];
-        uint32_t u32;
-    }
-    host_bytes = { { 0, 1, 2, 3 } };
+    SMOL_ASSERT (src_rmeta != NULL);
+    SMOL_ASSERT (dest_rmeta != NULL);
-    if (host_bytes.u32 == 0x03020100UL)
-        return TRUE;
+    scale_ctx->src_unpack_row_func = src_rmeta->repack_row_func;
+    scale_ctx->pack_row_func = dest_rmeta->repack_row_func;
-    return FALSE;
-}
+    if (scale_ctx->composite_op == SMOL_COMPOSITE_SRC_OVER_DEST)
+    {
+        const SmolRepackMeta *dest_unpack_rmeta;
-/* The generic unpack/pack functions fetch and store pixels as u32.
- * This means the byte order will be reversed on little endian, with
- * consequences for the alpha channel and reordering logic. We deal
- * with this by using the apparent byte order internally. */
-static SmolPixelType
-get_host_pixel_type (SmolPixelType pixel_type)
-{
-    SmolPixelType host_pixel_type = SMOL_PIXEL_MAX;
+        /* Need to unpack destination rows and composite on them */
-    if (!host_is_little_endian ())
-        return pixel_type;
+        find_repacks (implementations,
+                      dest_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage,
+                      dest_pmeta->alpha, internal_alpha, dest_pmeta->alpha,
+                      SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED,
+                      dest_pmeta, dest_pmeta,
+                      &dest_unpack_rmeta, NULL);
-    /* We use a switch for this so the compiler can remind us
-     * to keep it in sync with the SmolPixelType enum. */
-    switch (pixel_type)
-    {
-        case SMOL_PIXEL_RGBA8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_ABGR8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_BGRA8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_ARGB8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_ARGB8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_BGRA8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_ABGR8_PREMULTIPLIED:
-            host_pixel_type = SMOL_PIXEL_RGBA8_PREMULTIPLIED; break;
-        case SMOL_PIXEL_RGBA8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_ABGR8_UNASSOCIATED; break;
-        case SMOL_PIXEL_BGRA8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_ARGB8_UNASSOCIATED; break;
-        case SMOL_PIXEL_ARGB8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_BGRA8_UNASSOCIATED; break;
-        case SMOL_PIXEL_ABGR8_UNASSOCIATED:
-            host_pixel_type = SMOL_PIXEL_RGBA8_UNASSOCIATED; break;
-        case SMOL_PIXEL_RGB8:
-            host_pixel_type = SMOL_PIXEL_RGB8; break;
-        case SMOL_PIXEL_BGR8:
-            host_pixel_type = SMOL_PIXEL_BGR8; break;
-        case SMOL_PIXEL_MAX:
-            host_pixel_type = SMOL_PIXEL_MAX; break;
+        SMOL_ASSERT (dest_unpack_rmeta != NULL);
+
+        scale_ctx->dest_unpack_row_func = dest_unpack_rmeta->repack_row_func;
     }
+    else
+    {
+        /* Compositing on solid color */
-    return host_pixel_type;
-}
+        if (color_pixel)
+        {
+            SmolPixelType color_ptype;
+            const SmolPixelTypeMeta *color_pmeta;
+            const SmolRepackMeta *color_rmeta;
-#ifdef SMOL_WITH_AVX2
+            color_ptype = get_host_pixel_type (color_pixel_type);
+            color_pmeta = &pixel_type_meta [color_ptype];
-static SmolBool
-have_avx2 (void)
-{
-#ifdef HAVE_GCC_X86_FEATURE_BUILTINS
-    __builtin_cpu_init ();
+            find_repacks (implementations,
+                          color_pmeta->storage, scale_ctx->storage_type, dest_pmeta->storage,
+                          color_pmeta->alpha, internal_alpha, dest_pmeta->alpha,
+                          SMOL_GAMMA_SRGB_COMPRESSED, scale_ctx->gamma_type, SMOL_GAMMA_SRGB_COMPRESSED,
+                          color_pmeta, dest_pmeta,
+                          &color_rmeta, NULL);
-    if (__builtin_cpu_supports ("avx2"))
-        return TRUE;
-#endif
+            SMOL_ASSERT (color_rmeta != NULL);
-    return FALSE;
-}
+            color_rmeta->repack_row_func (color_pixel, scale_ctx->color_pixel, 1);
+        }
+        else
+        {
+            /* No color provided; use fully transparent black */
+            memset (scale_ctx->color_pixel, 0, sizeof (scale_ctx->color_pixel));
+        }
-#endif
+        populate_clear_batch (scale_ctx);
+    }
-static void
-try_override_conversion (SmolScaleCtx *scale_ctx,
-                         const SmolImplementation *impl,
-                         SmolPixelType ptype_in,
-                         SmolPixelType ptype_out,
-                         uint8_t *n_bytes_per_pixel)
-{
-    const SmolConversion *conv;
+    /* Install filters and compositors */
-    conv = &impl->ctab->conversions
-        [scale_ctx->storage_type] [ptype_in] [ptype_out];
+    scale_ctx->hfilter_func = NULL;
+    scale_ctx->vfilter_func = NULL;
+    scale_ctx->composite_over_color_func = NULL;
+    scale_ctx->composite_over_dest_func = NULL;
+    scale_ctx->clear_dest_func = NULL;
-    if (conv->unpack_row_func && conv->pack_row_func)
+    for (i = 0; implementations [i]; i++)
     {
-        *n_bytes_per_pixel = conv->n_bytes_per_pixel;
-        scale_ctx->unpack_row_func = conv->unpack_row_func;
-        scale_ctx->pack_row_func = conv->pack_row_func;
+        SmolHFilterFunc *hfilter_func =
+            implementations [i]->hfilter_funcs [scale_ctx->storage_type] [scale_ctx->hdim.filter_type];
+        SmolVFilterFunc *vfilter_func =
+            implementations [i]->vfilter_funcs [scale_ctx->storage_type] [scale_ctx->vdim.filter_type];
+        SmolCompositeOverColorFunc *composite_over_color_func =
+            implementations [i]->composite_over_color_funcs [scale_ctx->storage_type];
+        SmolCompositeOverDestFunc *composite_over_dest_func =
+            implementations [i]->composite_over_dest_funcs [scale_ctx->storage_type];
+        SmolClearFunc *clear_dest_func =
+            implementations [i]->clear_funcs [dest_pmeta->storage];
+
+        if (!scale_ctx->hfilter_func && hfilter_func)
+        {
+            scale_ctx->hfilter_func = hfilter_func;
+            if (implementations [i]->init_h_func)
+                implementations [i]->init_h_func (scale_ctx);
+        }
+
+        if (!scale_ctx->vfilter_func && vfilter_func)
+        {
+            scale_ctx->vfilter_func = vfilter_func;
+            if (implementations [i]->init_v_func)
+                implementations [i]->init_v_func (scale_ctx);
+        }
+
+        if (!scale_ctx->composite_over_color_func && composite_over_color_func)
+            scale_ctx->composite_over_color_func = composite_over_color_func;
+        if (!scale_ctx->composite_over_dest_func && composite_over_dest_func)
+            scale_ctx->composite_over_dest_func = composite_over_dest_func;
+        if (!scale_ctx->clear_dest_func && clear_dest_func)
+            scale_ctx->clear_dest_func = clear_dest_func;
     }
-}
-static void
-try_override_filters (SmolScaleCtx *scale_ctx,
-                      const SmolImplementation *impl)
-{
-    SmolHFilterFunc *hfilter_func;
-    SmolVFilterFunc *vfilter_func;
-
-    hfilter_func = impl->hfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_h];
-    vfilter_func = impl->vfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_v];
-
-    if (hfilter_func)
-        scale_ctx->hfilter_func = hfilter_func;
-    if (vfilter_func)
-        scale_ctx->vfilter_func = vfilter_func;
+    SMOL_ASSERT (scale_ctx->hfilter_func != NULL);
+    SMOL_ASSERT (scale_ctx->vfilter_func != NULL);
 }
 
 static void
-get_implementations (SmolScaleCtx *scale_ctx)
+init_dim (SmolDim *dim,
+          uint32_t src_size_spx,
+          uint32_t dest_size_spx,
+          int32_t placement_ofs_spx,
+          int32_t placement_size_spx,
+          SmolFlags flags,
+          SmolStorageType *storage_type_out)
 {
-    const SmolConversion *conv;
-    SmolPixelType ptype_in, ptype_out;
-    uint8_t n_bytes_per_pixel;
-    const SmolImplementation *avx2_impl = NULL;
-
-#ifdef SMOL_WITH_AVX2
-    if (have_avx2 ())
-        avx2_impl = _smol_get_avx2_implementation ();
-#endif
-
-    ptype_in = get_host_pixel_type (scale_ctx->pixel_type_in);
-    ptype_out = get_host_pixel_type (scale_ctx->pixel_type_out);
-
-    /* Install generic unpack()/pack() */
-
-    conv = &generic_implementation.ctab->conversions
-        [scale_ctx->storage_type] [ptype_in] [ptype_out];
-
-    n_bytes_per_pixel = conv->n_bytes_per_pixel;
-    scale_ctx->unpack_row_func = conv->unpack_row_func;
-    scale_ctx->pack_row_func = conv->pack_row_func;
+    dim->src_size_spx = src_size_spx;
+    dim->src_size_px = SMOL_SPX_TO_PX (src_size_spx);
+    dim->dest_size_spx = dest_size_spx;
+    dim->dest_size_px = SMOL_SPX_TO_PX (dest_size_spx);
+    dim->placement_ofs_spx = placement_ofs_spx;
+    if (placement_ofs_spx < 0)
+        dim->placement_ofs_px = (placement_ofs_spx - 255) / SMOL_SUBPIXEL_MUL;
+    else
+        dim->placement_ofs_px = placement_ofs_spx / SMOL_SUBPIXEL_MUL;
+    dim->placement_size_spx = placement_size_spx;
+    dim->placement_size_px = SMOL_SPX_TO_PX (placement_size_spx + SMOL_SUBPIXEL_MOD (placement_ofs_spx));
-    /* Try to override with better unpack()/pack() implementations */
+    pick_filter_params (dim->src_size_px,
+                        dim->src_size_spx,
+                        dim->placement_ofs_spx,
+                        dim->placement_size_px,
+                        dim->placement_size_spx,
+                        &dim->n_halvings,
+                        &dim->placement_size_prehalving_px,
+                        &dim->placement_size_prehalving_spx,
+                        &dim->filter_type,
+                        storage_type_out,
+                        &dim->first_opacity,
+                        &dim->last_opacity,
+                        flags);
-    if (avx2_impl)
-        try_override_conversion (scale_ctx, avx2_impl,
-                                 ptype_in, ptype_out,
-                                 &n_bytes_per_pixel);
+    /* Calculate clip and clear intervals */
-    /* Some conversions require extra precision. This can only ever
-     * upgrade the storage from 64bpp to 128bpp, but we handle both
-     * cases here for clarity. */
-    if (n_bytes_per_pixel == 8)
-        scale_ctx->storage_type = SMOL_STORAGE_64BPP;
-    else if (n_bytes_per_pixel == 16)
-        scale_ctx->storage_type = SMOL_STORAGE_128BPP;
-    else
+    if (dim->placement_ofs_px > 0)
+    {
+        dim->clear_before_px = dim->placement_ofs_px;
+        dim->clip_before_px = 0;
+    }
+    else if (dim->placement_ofs_px < 0)
     {
-        assert (n_bytes_per_pixel == 8 || n_bytes_per_pixel == 16);
+        dim->clear_before_px = 0;
+        dim->clip_before_px = -dim->placement_ofs_px;
+        dim->first_opacity = 256;
     }
-    /* Install generic filters */
+    if (dim->placement_ofs_px + dim->placement_size_px < dim->dest_size_px)
+    {
+        dim->clear_after_px = dim->dest_size_px - dim->placement_ofs_px - dim->placement_size_px;
+        dim->clip_after_px = 0;
+    }
+    else if (dim->placement_ofs_px + dim->placement_size_px > dim->dest_size_px)
+    {
+        dim->clear_after_px = 0;
+        dim->clip_after_px = dim->placement_ofs_px + dim->placement_size_px - dim->dest_size_px;
+        dim->last_opacity = 256;
+    }
-    scale_ctx->hfilter_func = generic_implementation.hfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_h];
-    scale_ctx->vfilter_func = generic_implementation.vfilter_funcs
-        [scale_ctx->storage_type] [scale_ctx->filter_v];
+    /* Clamp placement */
-    /* Try to override with better filter implementations */
+    if (dim->placement_ofs_px < 0)
+    {
+        dim->placement_size_px += dim->placement_ofs_px;
+        dim->placement_ofs_px = 0;
+    }
-    if (avx2_impl)
-        try_override_filters (scale_ctx, avx2_impl);
+    if (dim->placement_ofs_px + dim->placement_size_px > dim->dest_size_px)
+    {
+        dim->placement_size_px = dim->dest_size_px - dim->placement_ofs_px;
+    }
 }
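As a worked example of the clip/clear/clamp logic above (the field names come from the patch, the numbers are hypothetical): with dest_size_px = 100, placement_ofs_px = -20 and placement_size_px = 150, init_dim() ends up with

    dim->clip_before_px  = 20;   /* negative offset: the left edge is clipped, nothing to clear */
    dim->clear_before_px = 0;
    dim->clip_after_px   = 30;   /* -20 + 150 - 100 overhangs the right edge */
    dim->clear_after_px  = 0;
    dim->first_opacity = dim->last_opacity = 256;   /* clipped edges stay fully opaque */

and after the clamping step, placement_ofs_px = 0 and placement_size_px = 100, i.e. only the part of the placement that actually lands on the destination is kept.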
 
 static void
 smol_scale_init (SmolScaleCtx *scale_ctx,
-                 SmolPixelType pixel_type_in,
-                 const void *pixels_in,
-                 uint32_t width_in,
-                 uint32_t height_in,
-                 uint32_t rowstride_in,
-                 SmolPixelType pixel_type_out,
-                 void *pixels_out,
-                 uint32_t width_out,
-                 uint32_t height_out,
-                 uint32_t rowstride_out,
+                 const void *src_pixels,
+                 SmolPixelType src_pixel_type,
+                 uint32_t src_width_spx,
+                 uint32_t src_height_spx,
+                 uint32_t src_rowstride,
+                 const void *color_pixel,
+                 SmolPixelType color_pixel_type,
+                 void *dest_pixels,
+                 SmolPixelType dest_pixel_type,
+                 uint32_t dest_width_spx,
+                 uint32_t dest_height_spx,
+                 uint32_t dest_rowstride,
+                 int32_t placement_x_spx,
+                 int32_t placement_y_spx,
+                 int32_t placement_width_spx,
+                 int32_t placement_height_spx,
+                 SmolCompositeOp composite_op,
+                 SmolFlags flags,
                  SmolPostRowFunc post_row_func,
                  void *user_data)
 {
     SmolStorageType storage_type [2];
 
-    scale_ctx->pixel_type_in = pixel_type_in;
-    scale_ctx->pixels_in = pixels_in;
-    scale_ctx->width_in = width_in;
-    scale_ctx->height_in = height_in;
-    scale_ctx->rowstride_in = rowstride_in;
-    scale_ctx->pixel_type_out = pixel_type_out;
-    scale_ctx->pixels_out = pixels_out;
-    scale_ctx->width_out = width_out;
-    scale_ctx->height_out = height_out;
-    scale_ctx->rowstride_out = rowstride_out;
+    if (placement_width_spx <= 0 || placement_height_spx <= 0)
+    {
+        placement_width_spx = 0;
+        placement_height_spx = 0;
+        placement_x_spx = 0;
+        placement_y_spx = 0;
+    }
+
+    scale_ctx->src_pixels = src_pixels;
+    scale_ctx->src_pixel_type = src_pixel_type;
+    scale_ctx->src_rowstride = src_rowstride;
+
+    scale_ctx->dest_pixels = dest_pixels;
+    scale_ctx->dest_pixel_type = dest_pixel_type;
+    scale_ctx->dest_rowstride = dest_rowstride;
+
+    scale_ctx->composite_op = composite_op;
+    scale_ctx->flags = flags;
+    scale_ctx->gamma_type = (flags & SMOL_DISABLE_SRGB_LINEARIZATION)
+        ? SMOL_GAMMA_SRGB_COMPRESSED : SMOL_GAMMA_SRGB_LINEAR;
 
     scale_ctx->post_row_func = post_row_func;
     scale_ctx->user_data = user_data;
 
-    pick_filter_params (width_in, width_out,
-                        &scale_ctx->width_halvings,
-                        &scale_ctx->width_bilin_out,
-                        &scale_ctx->filter_h,
-                        &storage_type [0]);
-    pick_filter_params (height_in, height_out,
-                        &scale_ctx->height_halvings,
-                        &scale_ctx->height_bilin_out,
-                        &scale_ctx->filter_v,
-                        &storage_type [1]);
+    init_dim (&scale_ctx->hdim,
+              src_width_spx, dest_width_spx,
+              placement_x_spx, placement_width_spx,
+              flags, &storage_type [0]);
+    init_dim (&scale_ctx->vdim,
+              src_height_spx, dest_height_spx,
+              placement_y_spx, placement_height_spx,
+              flags, &storage_type [1]);
 
     scale_ctx->storage_type = MAX (storage_type [0], storage_type [1]);
 
-    scale_ctx->offsets_x = malloc (((scale_ctx->width_bilin_out + 1) * 2
-                                    + (scale_ctx->height_bilin_out + 1) * 2) * sizeof (uint16_t));
-    scale_ctx->offsets_y = scale_ctx->offsets_x + (scale_ctx->width_bilin_out + 1) * 2;
-
-    if (scale_ctx->filter_h == SMOL_FILTER_ONE)
-    {
-    }
-    else if (scale_ctx->filter_h == SMOL_FILTER_BOX)
-    {
-        precalc_boxes_array (scale_ctx->offsets_x, &scale_ctx->span_mul_x,
-                             width_in, scale_ctx->width_out, FALSE);
-    }
-    else /* SMOL_FILTER_BILINEAR_?H */
-    {
-        precalc_bilinear_array (scale_ctx->offsets_x,
-                                width_in, scale_ctx->width_bilin_out, FALSE);
-    }
-
-    if (scale_ctx->filter_v == SMOL_FILTER_ONE)
-    {
-    }
-    else if (scale_ctx->filter_v == SMOL_FILTER_BOX)
-    {
-        precalc_boxes_array (scale_ctx->offsets_y, &scale_ctx->span_mul_y,
-                             height_in, scale_ctx->height_out, TRUE);
-    }
-    else /* SMOL_FILTER_BILINEAR_?H */
-    {
-        precalc_bilinear_array (scale_ctx->offsets_y,
-                                height_in, scale_ctx->height_bilin_out, TRUE);
-    }
+    scale_ctx->hdim.precalc = smol_alloc_aligned (((scale_ctx->hdim.placement_size_prehalving_px + 1) * 2
+                                                   + (scale_ctx->vdim.placement_size_prehalving_px + 1) * 2)
+                                                  * sizeof (uint16_t),
+                                                  &scale_ctx->precalc_storage);
+    scale_ctx->vdim.precalc = ((uint16_t *) scale_ctx->hdim.precalc) + (scale_ctx->hdim.placement_size_prehalving_px + 1) * 2;
 
-    get_implementations (scale_ctx);
+    get_implementations (scale_ctx, color_pixel, color_pixel_type);
 }
 
 static void
 smol_scale_finalize (SmolScaleCtx *scale_ctx)
 {
-    free (scale_ctx->offsets_x);
+    free (scale_ctx->precalc_storage);
 }
 
-/* --- Public API --- */
+/* ---------- *
+ * Public API *
+ * ---------- */
 
 SmolScaleCtx *
-smol_scale_new (SmolPixelType pixel_type_in,
-                const void *pixels_in,
-                uint32_t width_in,
-                uint32_t height_in,
-                uint32_t rowstride_in,
-                SmolPixelType pixel_type_out,
-                void *pixels_out,
-                uint32_t width_out,
-                uint32_t height_out,
-                uint32_t rowstride_out)
+smol_scale_new_simple (const void *src_pixels,
+                       SmolPixelType src_pixel_type,
+                       uint32_t src_width,
+                       uint32_t src_height,
+                       uint32_t src_rowstride,
+                       void *dest_pixels,
+                       SmolPixelType dest_pixel_type,
+                       uint32_t dest_width,
+                       uint32_t dest_height,
+                       uint32_t dest_rowstride,
+                       SmolFlags flags)
 {
     SmolScaleCtx *scale_ctx;
 
     scale_ctx = calloc (sizeof (SmolScaleCtx), 1);
     smol_scale_init (scale_ctx,
-                     pixel_type_in,
-                     pixels_in,
-                     width_in,
-                     height_in,
-                     rowstride_in,
-                     pixel_type_out,
-                     pixels_out,
-                     width_out,
-                     height_out,
-                     rowstride_out,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     NULL,
+                     0,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     0,
+                     0,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     SMOL_COMPOSITE_SRC,
+                     flags,
                      NULL, NULL);
     return scale_ctx;
 }
 
+void
+smol_scale_simple (const void *src_pixels,
+                   SmolPixelType src_pixel_type,
+                   uint32_t src_width,
+                   uint32_t src_height,
+                   uint32_t src_rowstride,
+                   void *dest_pixels,
+                   SmolPixelType dest_pixel_type,
+                   uint32_t dest_width,
+                   uint32_t dest_height,
+                   uint32_t dest_rowstride,
+                   SmolFlags flags)
+{
+    SmolScaleCtx scale_ctx = { 0 };
+    int first_row, n_rows;
+
+    smol_scale_init (&scale_ctx,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     NULL,
+                     0,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     0,
+                     0,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     SMOL_COMPOSITE_SRC,
+                     flags,
+                     NULL, NULL);
+
+    first_row = 0;
+    n_rows = scale_ctx.vdim.dest_size_px;
+
+    if (check_row_range (&scale_ctx, &first_row, &n_rows))
+    {
+        do_rows (&scale_ctx,
+                 dest_row_ofs_to_pointer (&scale_ctx, 0),
+                 first_row,
+                 n_rows);
+    }
+
+    smol_scale_finalize (&scale_ctx);
+}
+
 SmolScaleCtx *
-smol_scale_new_full (SmolPixelType pixel_type_in,
-                     const void *pixels_in,
-                     uint32_t width_in,
-                     uint32_t height_in,
-                     uint32_t rowstride_in,
-                     SmolPixelType pixel_type_out,
-                     void *pixels_out,
-                     uint32_t width_out,
-                     uint32_t height_out,
-                     uint32_t rowstride_out,
+smol_scale_new_full (const void *src_pixels,
+                     SmolPixelType src_pixel_type,
+                     uint32_t src_width,
+                     uint32_t src_height,
+                     uint32_t src_rowstride,
+                     const void *color_pixel,
+                     SmolPixelType color_pixel_type,
+                     void *dest_pixels,
+                     SmolPixelType dest_pixel_type,
+                     uint32_t dest_width,
+                     uint32_t dest_height,
+                     uint32_t dest_rowstride,
+                     int32_t placement_x,
+                     int32_t placement_y,
+                     uint32_t placement_width,
+                     uint32_t placement_height,
+                     SmolCompositeOp composite_op,
+                     SmolFlags flags,
                      SmolPostRowFunc post_row_func,
                      void *user_data)
 {
@@ -3032,16 +1346,24 @@ smol_scale_new_full (SmolPixelType pixel_type_in,
     scale_ctx = calloc (sizeof (SmolScaleCtx), 1);
 
     smol_scale_init (scale_ctx,
-                     pixel_type_in,
-                     pixels_in,
-                     width_in,
-                     height_in,
-                     rowstride_in,
-                     pixel_type_out,
-                     pixels_out,
-                     width_out,
-                     height_out,
-                     rowstride_out,
+                     src_pixels,
+                     src_pixel_type,
+                     SMOL_PX_TO_SPX (src_width),
+                     SMOL_PX_TO_SPX (src_height),
+                     src_rowstride,
+                     color_pixel,
+                     color_pixel_type,
+                     dest_pixels,
+                     dest_pixel_type,
+                     SMOL_PX_TO_SPX (dest_width),
+                     SMOL_PX_TO_SPX (dest_height),
+                     dest_rowstride,
+                     placement_x,
+                     placement_y,
+                     placement_width,
+                     placement_height,
+                     composite_op,
+                     flags,
                      post_row_func, user_data);
 
     return scale_ctx;
@@ -3054,52 +1376,31 @@ smol_scale_destroy (SmolScaleCtx *scale_ctx)
     free (scale_ctx);
 }
 
-void
-smol_scale_simple (SmolPixelType pixel_type_in,
-                   const void *pixels_in,
-                   uint32_t width_in,
-                   uint32_t height_in,
-                   uint32_t rowstride_in,
-                   SmolPixelType pixel_type_out,
-                   void *pixels_out,
-                   uint32_t width_out,
-                   uint32_t height_out,
-                   uint32_t rowstride_out)
-{
-    SmolScaleCtx scale_ctx;
-
-    smol_scale_init (&scale_ctx,
-                     pixel_type_in, pixels_in,
-                     width_in, height_in, rowstride_in,
-                     pixel_type_out, pixels_out,
-                     width_out, height_out, rowstride_out,
-                     NULL, NULL);
-    do_rows (&scale_ctx,
-             outrow_ofs_to_pointer (&scale_ctx, 0),
-             0,
-             scale_ctx.height_out);
-    smol_scale_finalize (&scale_ctx);
-}
-
 void
 smol_scale_batch (const SmolScaleCtx *scale_ctx,
-                  uint32_t first_out_row,
-                  uint32_t n_out_rows)
+                  int32_t first_dest_row,
+                  int32_t n_dest_rows)
 {
+    if (!check_row_range (scale_ctx, &first_dest_row, &n_dest_rows))
+        return;
+
     do_rows (scale_ctx,
-             outrow_ofs_to_pointer (scale_ctx, first_out_row),
-             first_out_row,
-             n_out_rows);
+             dest_row_ofs_to_pointer (scale_ctx, first_dest_row),
+             first_dest_row,
+             n_dest_rows);
 }
 
 void
 smol_scale_batch_full (const SmolScaleCtx *scale_ctx,
-                       void *outrows_dest,
-                       uint32_t first_out_row,
-                       uint32_t n_out_rows)
+                       void *dest,
+                       int32_t first_dest_row,
+                       int32_t n_dest_rows)
 {
+    if (!check_row_range (scale_ctx, &first_dest_row, &n_dest_rows))
+        return;
+
     do_rows (scale_ctx,
-             outrows_dest,
-             first_out_row,
-             n_out_rows);
+             dest,
+             first_dest_row,
+             n_dest_rows);
 }
diff --git a/chafa/internal/smolscale/smolscale.h b/chafa/internal/smolscale/smolscale.h
index c7cc99e6..2a6fe3a1 100644
--- a/chafa/internal/smolscale/smolscale.h
+++ b/chafa/internal/smolscale/smolscale.h
@@ -11,6 +11,23 @@
 extern "C" {
 #endif
 
+#define SMOL_SUBPIXEL_SHIFT 8
+#define SMOL_SUBPIXEL_MUL (1 << (SMOL_SUBPIXEL_SHIFT))
+
+/* Applies modulo twice, yielding a positive fraction for negative offsets */
+#define SMOL_SUBPIXEL_MOD(n) ((((n) % SMOL_SUBPIXEL_MUL) + SMOL_SUBPIXEL_MUL) % SMOL_SUBPIXEL_MUL)
+
+#define SMOL_PX_TO_SPX(px) ((px) * (SMOL_SUBPIXEL_MUL))
+#define SMOL_SPX_TO_PX(spx) (((spx) + (SMOL_SUBPIXEL_MUL) - 1) / (SMOL_SUBPIXEL_MUL))
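A quick illustration of the new subpixel helpers. This is a hypothetical test program, not part of the patch; it only demonstrates the arithmetic of the macros added above:

    #include <assert.h>
    #include <stdint.h>
    #include "smolscale.h"

    int main (void)
    {
        assert (SMOL_PX_TO_SPX (3) == 768);      /* 3 px = 768 subpixels at 1/256 px granularity */
        assert (SMOL_SPX_TO_PX (769) == 4);      /* converts back, rounding up to whole pixels */
        assert (SMOL_SUBPIXEL_MOD (-1) == 255);  /* negative offsets still yield a positive fraction */
        return 0;
    }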
+
+typedef enum
+{
+    SMOL_NO_FLAGS = 0,
+    SMOL_DISABLE_ACCELERATION = (1 << 0),
+    SMOL_DISABLE_SRGB_LINEARIZATION = (1 << 1)
+}
+SmolFlags;
+
 typedef enum
 {
     /* 32 bits per pixel */
@@ -34,7 +51,15 @@
 }
 SmolPixelType;
 
-typedef void (SmolPostRowFunc) (uint32_t *row_inout,
+typedef enum
+{
+    SMOL_COMPOSITE_SRC,
+    SMOL_COMPOSITE_SRC_CLEAR_DEST,
+    SMOL_COMPOSITE_SRC_OVER_DEST
+}
+SmolCompositeOp;
+
+typedef void (SmolPostRowFunc) (void *row_inout,
                                 int width,
                                 void *user_data);
 
@@ -44,23 +69,52 @@ typedef struct SmolScaleCtx SmolScaleCtx;
 * the source memory and an existing allocation to receive the output data.
 * This interface can only be used from a single thread. */
 
-void smol_scale_simple (SmolPixelType pixel_type_in, const void *pixels_in,
-                        uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                        SmolPixelType pixel_type_out, void *pixels_out,
-                        uint32_t width_out, uint32_t height_out, uint32_t rowstride_out);
+void smol_scale_simple (const void *src_pixels,
+                        SmolPixelType src_pixel_type,
+                        uint32_t src_width,
+                        uint32_t src_height,
+                        uint32_t src_rowstride,
+                        void *dest_pixels,
+                        SmolPixelType dest_pixel_type,
+                        uint32_t dest_width,
+                        uint32_t dest_height,
+                        uint32_t dest_rowstride,
+                        SmolFlags flags);
 
 /* Batch API: Allows scaling a few rows at a time. Suitable for multithreading. */
 
-SmolScaleCtx *smol_scale_new (SmolPixelType pixel_type_in, const void *pixels_in,
-                              uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                              SmolPixelType pixel_type_out, void *pixels_out,
-                              uint32_t width_out, uint32_t height_out, uint32_t rowstride_out);
-
-SmolScaleCtx *smol_scale_new_full (SmolPixelType pixel_type_in, const void *pixels_in,
-                                   uint32_t width_in, uint32_t height_in, uint32_t rowstride_in,
-                                   SmolPixelType pixel_type_out, void *pixels_out,
-                                   uint32_t width_out, uint32_t height_out, uint32_t rowstride_out,
-                                   SmolPostRowFunc post_row_func, void *user_data);
+SmolScaleCtx *smol_scale_new_simple (const void *src_pixels,
+                                     SmolPixelType src_pixel_type,
+                                     uint32_t src_width,
+                                     uint32_t src_height,
+                                     uint32_t src_rowstride,
+                                     void *dest_pixels,
+                                     SmolPixelType dest_pixel_type,
+                                     uint32_t dest_width,
+                                     uint32_t dest_height,
+                                     uint32_t dest_rowstride,
+                                     SmolFlags flags);
+
+SmolScaleCtx *smol_scale_new_full (const void *src_pixels,
+                                   SmolPixelType src_pixel_type,
+                                   uint32_t src_width,
+                                   uint32_t src_height,
+                                   uint32_t src_rowstride,
+                                   const void *color_pixel,
+                                   SmolPixelType color_pixel_type,
+                                   void *dest_pixels,
+                                   SmolPixelType dest_pixel_type,
+                                   uint32_t dest_width,
+                                   uint32_t dest_height,
+                                   uint32_t dest_rowstride,
+                                   int32_t placement_x,
+                                   int32_t placement_y,
+                                   uint32_t placement_width,
+                                   uint32_t placement_height,
+                                   SmolCompositeOp composite_op,
+                                   SmolFlags flags,
+                                   SmolPostRowFunc post_row_func,
+                                   void *user_data);
 
 void smol_scale_destroy (SmolScaleCtx *scale_ctx);
 
@@ -68,7 +122,7 @@ void smol_scale_destroy (SmolScaleCtx *scale_ctx);
 * threads, as long as the outrows do not overlap. Make sure all workers are
 * finished before you call smol_scale_destroy(). */
-void smol_scale_batch (const SmolScaleCtx *scale_ctx, uint32_t first_outrow, uint32_t n_outrows);
+void smol_scale_batch (const SmolScaleCtx *scale_ctx, int32_t first_outrow, int32_t n_outrows);
 
 /* Like smol_scale_batch(), but will write the output rows to outrows_dest
 * instead of relative to pixels_out address handed to smol_scale_new(). The
@@ -76,7 +130,7 @@ void smol_scale_batch (const SmolScaleCtx *scale_ctx, uint32_t first_outrow, uin
 
 void smol_scale_batch_full (const SmolScaleCtx *scale_ctx,
                             void *outrows_dest,
-                            uint32_t first_outrow, uint32_t n_outrows);
+                            int32_t first_outrow, int32_t n_outrows);
 
 #ifdef __cplusplus
 }
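For orientation, here is a hypothetical caller of the reworked batch API. Buffer sizes, the white fill color and the choice of composite op are illustrative only; the argument order and the use of SMOL_PX_TO_SPX for placement follow the declarations above:

    #include <stdint.h>
    #include "smolscale.h"

    /* Scale a 640x480 RGBA source to 160x120 and place it at (80, 60) on a
     * 320x240 canvas, clearing the rest of the canvas. Placement is given in
     * 1/256-pixel units, hence SMOL_PX_TO_SPX. */
    static void
    scale_into_canvas (const void *src, void *dest)
    {
        static const uint8_t white [4] = { 0xff, 0xff, 0xff, 0xff };
        SmolScaleCtx *ctx;

        ctx = smol_scale_new_full (src, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   640, 480, 640 * 4,
                                   white, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   dest, SMOL_PIXEL_RGBA8_UNASSOCIATED,
                                   320, 240, 320 * 4,
                                   SMOL_PX_TO_SPX (80), SMOL_PX_TO_SPX (60),
                                   SMOL_PX_TO_SPX (160), SMOL_PX_TO_SPX (120),
                                   SMOL_COMPOSITE_SRC_CLEAR_DEST,
                                   SMOL_NO_FLAGS,
                                   NULL, NULL);

        /* Rows could also be split across worker threads; here we do them all at once. */
        smol_scale_batch (ctx, 0, 240);
        smol_scale_destroy (ctx);
    }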