Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SIMD versions of the invert transform #2534

Merged
merged 13 commits into from
Apr 12, 2024
4 changes: 4 additions & 0 deletions src_c/simd_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@

// SSE2 functions
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
void
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf);
#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */

// AVX2 functions
void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf);
91 changes: 91 additions & 0 deletions src_c/simd_transform_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,94 @@ pg_avx2_at_runtime_but_uncompiled()
}
return 0;
}

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
int s_row_skip = (src->pitch - src->w * 4) / 4;

// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}

int remaining_pixels = pixel_batch_length % 8;
int perfect_8_pixels = pixel_batch_length / 8;

int perfect_8_pixels_batch_counter = perfect_8_pixels;
int remaining_pixels_batch_counter = remaining_pixels;

Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;

Uint32 amask =
~(src->format->Rmask | src->format->Gmask | src->format->Bmask);

__m256i *srcp256 = (__m256i *)src->pixels;
__m256i *dstp256 = (__m256i *)newsurf->pixels;

__m256i mm256_src, mm256_dst, mm256_two_five_fives, mm256_alpha,
mm256_rgb_mask, mm256_alpha_mask;

mm256_two_five_fives = _mm256_set1_epi16(0xFFFF);
mm256_alpha_mask = _mm256_set1_epi32(amask);

__m256i _partial8_mask =
_mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00,
(remaining_pixels > 5) ? 0x80000000 : 0x00,
(remaining_pixels > 4) ? 0x80000000 : 0x00,
(remaining_pixels > 3) ? 0x80000000 : 0x00,
(remaining_pixels > 2) ? 0x80000000 : 0x00,
(remaining_pixels > 1) ? 0x80000000 : 0x00,
(remaining_pixels > 0) ? 0x80000000 : 0x00);
MyreMylar marked this conversation as resolved.
Show resolved Hide resolved

while (num_batches--) {
perfect_8_pixels_batch_counter = perfect_8_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_8_pixels_batch_counter--) {
mm256_src = _mm256_loadu_si256(srcp256);

/* do the invert */
mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src);
/* blend the original alpha in */
mm256_dst =
_mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask);

_mm256_storeu_si256(dstp256, mm256_dst);

srcp256++;
dstp256++;
itzpr3d4t0r marked this conversation as resolved.
Show resolved Hide resolved
}
srcp = (Uint32 *)srcp256;
dstp = (Uint32 *)dstp256;
if (remaining_pixels_batch_counter > 0) {
mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);

/* do the invert */
mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src);
/* blend the original alpha in */
mm256_dst =
_mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask);

_mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);

srcp += remaining_pixels_batch_counter;
dstp += remaining_pixels_batch_counter;
}
srcp += s_row_skip;
srcp256 = (__m256i *)srcp;
}
}
#else
void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
89 changes: 89 additions & 0 deletions src_c/simd_transform_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,92 @@ pg_neon_at_runtime_but_uncompiled()
}
return 0;
}

#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))

void
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf)
{
int s_row_skip = (src->pitch - src->w * 4) / 4;

// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}
int remaining_pixels = pixel_batch_length % 4;
int perfect_4_pixels = pixel_batch_length / 4;

int perfect_4_pixels_batch_counter = perfect_4_pixels;
int remaining_pixels_batch_counter = remaining_pixels;

Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;

Uint32 rgbmask =
(src->format->Rmask | src->format->Gmask | src->format->Bmask);
MyreMylar marked this conversation as resolved.
Show resolved Hide resolved
Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask;
Uint64 amask64 = ~rgbmask64;

__m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask,
mm_rgb_mask;

__m128i *srcp128 = (__m128i *)src->pixels;
__m128i *dstp128 = (__m128i *)newsurf->pixels;

mm_rgb_mask = _mm_set1_epi64x(rgbmask64);
mm_alpha_mask = _mm_set1_epi64x(amask64);
mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF);

while (num_batches--) {
perfect_4_pixels_batch_counter = perfect_4_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_4_pixels_batch_counter--) {
mm_src = _mm_loadu_si128(srcp128);
/*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
/* First we strip out the alpha so we have one of our 4 channels
empty for the rest of the calculation */
mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
/*mm_src = 0x00RRGGBB00RRGGBB00RRGGBB00RRGGBB*/

/*invert the colours*/
mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src);

/*add the alpha channel back*/
mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
/*mm_dst = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
_mm_storeu_si128(dstp128, mm_dst);
/*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
srcp128++;
dstp128++;
}
srcp = (Uint32 *)srcp128;
dstp = (Uint32 *)dstp128;
if (remaining_pixels_batch_counter > 0) {
MyreMylar marked this conversation as resolved.
Show resolved Hide resolved
mm_src = _mm_cvtsi32_si128(*srcp);
/*mm_src = 0x000000000000000000000000AARRGGBB*/
/* First we strip out the alpha so we have one of our 4 channels
empty for the rest of the calculation */
mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
/*mm_src = 0x00000000000000000000000000RRGGBB*/

/*invert the colours*/
mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src);

/*add the alpha channel back*/
mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
/*mm_dst = 0x000000000000000000000000AARRGGBB*/
*dstp = _mm_cvtsi128_si32(mm_dst);
/*dstp = 0xAARRGGBB*/
srcp++;
dstp++;
}
srcp += s_row_skip;
srcp128 = (__m128i *)srcp;
}
}
#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
49 changes: 37 additions & 12 deletions src_c/transform.c
Original file line number Diff line number Diff line change
Expand Up @@ -3279,6 +3279,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs)
return (PyObject *)pgSurface_New(new_surf);
}

void
invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
{
int x, y;
for (y = 0; y < src->h; y++) {
for (x = 0; x < src->w; x++) {
Uint32 pixel;
Uint8 *pix;
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
pix);
unsigned char r, g, b, a;
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
newsurf->format, pix);
}
}
}

SDL_Surface *
invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
{
Expand Down Expand Up @@ -3306,19 +3325,25 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
"Source and destination surfaces need the same format."));
}

int x, y;
for (y = 0; y < src->h; y++) {
for (x = 0; x < src->w; x++) {
Uint32 pixel;
Uint8 *pix;
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
pix);
unsigned char r, g, b, a;
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
newsurf->format, pix);
if (src->format->BytesPerPixel == 4 &&
MyreMylar marked this conversation as resolved.
Show resolved Hide resolved
src->format->Rmask == newsurf->format->Rmask &&
src->format->Gmask == newsurf->format->Gmask &&
src->format->Bmask == newsurf->format->Bmask &&
(src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
if (pg_has_avx2()) {
invert_avx2(src, newsurf);
}
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
else if (pg_HasSSE_NEON()) {
invert_sse2(src, newsurf);
}
#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
else {
invert_non_simd(src, newsurf);
}
}
else {
invert_non_simd(src, newsurf);
}

SDL_UnlockSurface(newsurf);
Expand Down
Loading