From 20f2f4c5818de9a7c1da947b135732b510fa1bca Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 9 Jan 2025 12:15:36 -0500 Subject: [PATCH] Thread-local arenas Currently, all threads use the same arena for imaging. This can result in a lot of contention when there are enough workers and the mutex is constantly being checked. This commit instead introduces lockless thread-local arenas for environments that support it. --- setup.py | 65 +++++++++++++++++++++++++++++++++++----- src/_imaging.c | 36 +++++++++++----------- src/libImaging/Imaging.h | 31 +++++++++++++++++-- src/libImaging/Storage.c | 20 ++++++------- 4 files changed, 114 insertions(+), 38 deletions(-) diff --git a/setup.py b/setup.py index a85731db936..a79b968f3c3 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,14 @@ # ------------------------------ from __future__ import annotations +import distutils.ccompiler import os import re import shutil import struct import subprocess import sys +import tempfile import warnings from collections.abc import Iterator from typing import Any @@ -292,6 +294,47 @@ def _pkg_config(name: str) -> tuple[list[str], list[str]] | None: return None +def _try_compile(compiler: distutils.ccompiler.CCompiler, code: str) -> bool: + try: + with tempfile.TemporaryDirectory() as d: + fn = os.path.join(d, "test.c") + with open(fn, "w") as f: + f.write(code) + compiler.compile([fn], output_dir=d, extra_preargs=["-Werror"]) + return True + except Exception: + return False + + +def _try_compile_attr(compiler: distutils.ccompiler.CCompiler, attr: str) -> bool: + code = f""" + #pragma GCC diagnostic error "-Wattributes" + #pragma clang diagnostic error "-Wattributes" + + int {attr} foo; + int main() {{ + return 0; + }} + """ + + return _try_compile(compiler, code) + + +def _try_compile_tls_define_macros( + compiler: distutils.ccompiler.CCompiler, +) -> list[tuple[str, str | None]]: + if _try_compile_attr(compiler, "thread_local"): # C23 + return [("HAVE_THREAD_LOCAL", None)] + elif _try_compile_attr(compiler, "_Thread_local"): # C11/C17 + return [("HAVE__THREAD_LOCAL", None)] + elif _try_compile_attr(compiler, "__thread"): # GCC/clang + return [("HAVE___THREAD", None)] + elif _try_compile_attr(compiler, "__declspec(thread)"): # MSVC + return [("HAVE___DECLSPEC_THREAD_", None)] + else: + return [] + + class pil_build_ext(build_ext): class ext_feature: features = [ @@ -426,13 +469,14 @@ def finalize_options(self) -> None: def _update_extension( self, name: str, - libraries: list[str] | list[str | bool | None], + libraries: list[str] | list[str | bool | None] | None = None, define_macros: list[tuple[str, str | None]] | None = None, sources: list[str] | None = None, ) -> None: for extension in self.extensions: if extension.name == name: - extension.libraries += libraries + if libraries is not None: + extension.libraries += libraries if define_macros is not None: extension.define_macros += define_macros if sources is not None: @@ -890,7 +934,10 @@ def build_extensions(self) -> None: defs.append(("PILLOW_VERSION", f'"{PILLOW_VERSION}"')) - self._update_extension("PIL._imaging", libs, defs) + tls_define_macros = _try_compile_tls_define_macros(self.compiler) + self._update_extension("PIL._imaging", libs, defs + tls_define_macros) + self._update_extension("PIL._imagingmath", define_macros=tls_define_macros) + self._update_extension("PIL._imagingmorph", define_macros=tls_define_macros) # # additional libraries @@ -913,7 +960,9 @@ def build_extensions(self) -> None: libs.append(feature.get("fribidi")) else: # building FriBiDi shim from src/thirdparty srcs.append("src/thirdparty/fribidi-shim/fribidi.c") - self._update_extension("PIL._imagingft", libs, defs, srcs) + self._update_extension( + "PIL._imagingft", libs, defs + tls_define_macros, srcs + ) else: self._remove_extension("PIL._imagingft") @@ -922,19 +971,19 @@ def build_extensions(self) -> None: libs = [feature.get("lcms")] if sys.platform == "win32": libs.extend(["user32", "gdi32"]) - self._update_extension("PIL._imagingcms", libs) + self._update_extension("PIL._imagingcms", libs, tls_define_macros) else: self._remove_extension("PIL._imagingcms") webp = feature.get("webp") if isinstance(webp, str): libs = [webp, webp + "mux", webp + "demux"] - self._update_extension("PIL._webp", libs) + self._update_extension("PIL._webp", libs, tls_define_macros) else: self._remove_extension("PIL._webp") tk_libs = ["psapi"] if sys.platform in ("win32", "cygwin") else [] - self._update_extension("PIL._imagingtk", tk_libs) + self._update_extension("PIL._imagingtk", tk_libs, tls_define_macros) build_ext.build_extensions(self) @@ -972,7 +1021,7 @@ def summary_report(self, feature: ext_feature) -> None: (feature.get("tiff"), "LIBTIFF"), (feature.get("freetype"), "FREETYPE2"), (feature.get("raqm"), "RAQM (Text shaping)", raqm_extra_info), - (feature.get("lcms"), "LITTLECMS2"), + (feature.get("lcms"), "LITTLECoMS2"), (feature.get("webp"), "WEBP"), (feature.get("xcb"), "XCB (X protocol)"), ] diff --git a/src/_imaging.c b/src/_imaging.c index 5d6d97bedab..ed9cf3d358b 100644 --- a/src/_imaging.c +++ b/src/_imaging.c @@ -3931,7 +3931,7 @@ _get_stats(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingMemoryArena arena = &ImagingDefaultArena; v = PyLong_FromLong(arena->stats_new_count); @@ -3958,7 +3958,7 @@ _get_stats(PyObject *self, PyObject *args) { PyDict_SetItemString(d, "blocks_cached", v ? v : Py_None); Py_XDECREF(v); - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); return d; } @@ -3968,14 +3968,14 @@ _reset_stats(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingMemoryArena arena = &ImagingDefaultArena; arena->stats_new_count = 0; arena->stats_allocated_blocks = 0; arena->stats_reused_blocks = 0; arena->stats_reallocated_blocks = 0; arena->stats_freed_blocks = 0; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); Py_INCREF(Py_None); return Py_None; @@ -3987,9 +3987,9 @@ _get_alignment(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); int alignment = ImagingDefaultArena.alignment; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); return PyLong_FromLong(alignment); } @@ -3999,9 +3999,9 @@ _get_block_size(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); int block_size = ImagingDefaultArena.block_size; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); return PyLong_FromLong(block_size); } @@ -4011,9 +4011,9 @@ _get_blocks_max(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); int blocks_max = ImagingDefaultArena.blocks_max; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); return PyLong_FromLong(blocks_max); } @@ -4034,9 +4034,9 @@ _set_alignment(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingDefaultArena.alignment = alignment; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); Py_INCREF(Py_None); return Py_None; @@ -4059,9 +4059,9 @@ _set_block_size(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingDefaultArena.block_size = block_size; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); Py_INCREF(Py_None); return Py_None; @@ -4085,9 +4085,9 @@ _set_blocks_max(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); int status = ImagingMemorySetBlocksMax(&ImagingDefaultArena, blocks_max); - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); if (!status) { return ImagingError_MemoryError(); } @@ -4104,9 +4104,9 @@ _clear_cache(PyObject *self, PyObject *args) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingMemoryClearCache(&ImagingDefaultArena, i); - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); Py_INCREF(Py_None); return Py_None; diff --git a/src/libImaging/Imaging.h b/src/libImaging/Imaging.h index 31052c68a97..0b23b3b2bce 100644 --- a/src/libImaging/Imaging.h +++ b/src/libImaging/Imaging.h @@ -149,6 +149,32 @@ struct ImagingPaletteInstance { int keep_cache; /* This palette will be reused; keep cache */ }; +#define IMAGING_ARENA_LOCK(m) +#define IMAGING_ARENA_UNLOCK(m) + +#if defined(__cplusplus) +#define IMAGING_ARENA_TLS thread_local +#elif defined(HAVE_THREAD_LOCAL) +#define IMAGING_ARENA_TLS thread_local +#elif defined(HAVE__THREAD_LOCAL) +#define IMAGING_ARENA_TLS _Thread_local +#elif defined(HAVE___THREAD) +#define IMAGING_ARENA_TLS __thread +#elif defined(HAVE___DECLSPEC_THREAD_) +#define IMAGING_ARENA_TLS __declspec(thread) +#elif defined(Py_GIL_DISABLED) +#define IMAGING_ARENA_TLS +#define IMAGING_ARENA_LOCKING + +#undef IMAGING_ARENA_LOCK +#undef IMAGING_ARENA_UNLOCK + +#define IMAGING_ARENA_LOCK(m) PyMutex_Lock(m) +#define IMAGING_ARENA_UNLOCK(m) PyMutex_Unlock(m) +#else +#define IMAGING_ARENA_TLS +#endif + typedef struct ImagingMemoryArena { int alignment; /* Alignment in memory of each line of an image */ int block_size; /* Preferred block size, bytes */ @@ -161,7 +187,8 @@ typedef struct ImagingMemoryArena { int stats_reallocated_blocks; /* Number of blocks which were actually reallocated after retrieving */ int stats_freed_blocks; /* Number of freed blocks */ -#ifdef Py_GIL_DISABLED + +#ifdef IMAGING_ARENA_LOCKING PyMutex mutex; #endif } *ImagingMemoryArena; @@ -169,7 +196,7 @@ typedef struct ImagingMemoryArena { /* Objects */ /* ------- */ -extern struct ImagingMemoryArena ImagingDefaultArena; +extern IMAGING_ARENA_TLS struct ImagingMemoryArena ImagingDefaultArena; extern int ImagingMemorySetBlocksMax(ImagingMemoryArena arena, int blocks_max); extern void diff --git a/src/libImaging/Storage.c b/src/libImaging/Storage.c index 522e9f37557..5c87714467c 100644 --- a/src/libImaging/Storage.c +++ b/src/libImaging/Storage.c @@ -218,9 +218,9 @@ ImagingNewPrologueSubtype(const char *mode, int xsize, int ysize, int size) { break; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); ImagingDefaultArena.stats_new_count += 1; - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); return im; } @@ -259,7 +259,7 @@ ImagingDelete(Imaging im) { #define IMAGING_PAGE_SIZE (4096) -struct ImagingMemoryArena ImagingDefaultArena = { +IMAGING_ARENA_TLS struct ImagingMemoryArena ImagingDefaultArena = { 1, // alignment 16 * 1024 * 1024, // block_size 0, // blocks_max @@ -270,7 +270,7 @@ struct ImagingMemoryArena ImagingDefaultArena = { 0, 0, 0, // Stats -#ifdef Py_GIL_DISABLED +#ifdef IMAGING_ARENA_LOCKING {0}, #endif }; @@ -369,12 +369,12 @@ ImagingDestroyArray(Imaging im) { int y = 0; if (im->blocks) { - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); while (im->blocks[y].ptr) { memory_return_block(&ImagingDefaultArena, im->blocks[y]); y += 1; } - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); free(im->blocks); } } @@ -504,11 +504,11 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) { return NULL; } - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); Imaging tmp = ImagingAllocateArray( im, &ImagingDefaultArena, dirty, ImagingDefaultArena.block_size ); - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); if (tmp) { return im; } @@ -516,9 +516,9 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) { ImagingError_Clear(); // Try to allocate the image once more with smallest possible block size - MUTEX_LOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex); tmp = ImagingAllocateArray(im, &ImagingDefaultArena, dirty, IMAGING_PAGE_SIZE); - MUTEX_UNLOCK(&ImagingDefaultArena.mutex); + IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex); if (tmp) { return im; }