diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 858de67525..5ccd4d18e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -660,6 +660,8 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; + // Pointer-to-pointer to the singleton internals for the first seen interpreter (may not be the + // main interpreter) std::unique_ptr *internals_singleton_pp_; }; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2abd8fc326..2268ca3ac7 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,13 +3,18 @@ #pragma once #include "detail/common.h" +#include "detail/internals.h" #include "gil.h" #include #include -#ifdef Py_GIL_DISABLED +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include + +using atomic_bool = std::atomic_bool; +#else +using atomic_bool = bool; #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -48,12 +53,19 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // functions, which is usually the case. // // For in-depth background, see docs/advanced/deadlock.md +#ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +// Subinterpreter support is disabled. +// In this case, we can store the result globally, because there is only a single interpreter. +// +// The life span of the stored result is the entire process lifetime. It is leaked on process +// termination to avoid destructor calls after the Python interpreter was finalized. template class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. 
template - gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*)(T &) /*unused*/ = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -74,29 +86,187 @@ class gil_safe_call_once_and_store { T &get_stored() { assert(is_initialized_); PYBIND11_WARNING_PUSH -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 // Needed for gcc 4.8.5 PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") -#endif +# endif return *reinterpret_cast(storage_); PYBIND11_WARNING_POP } constexpr gil_safe_call_once_and_store() = default; + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: + // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; -#ifdef Py_GIL_DISABLED - std::atomic_bool -#else - bool -#endif - is_initialized_{false}; + // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. + atomic_bool is_initialized_{false}; +}; +#else +// Subinterpreter support is enabled. +// In this case, we should store the result per-interpreter instead of globally, because each +// subinterpreter has its own separate state. The cached result may not be shareable across +// interpreters (e.g., imported modules and their members). 
+ +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; + void (*finalize)(T &) = nullptr; + std::atomic_bool is_initialized{false}; + + call_once_storage() = default; + ~call_once_storage() override { + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } + } + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; }; +/// Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's state +/// dict with a destructor that deletes every stored entry and then the map itself, to ensure cleanup when the interpreter is destroyed. +using call_once_storage_map_type = std::unordered_map; + +# define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" + +// The life span of the stored result is the entire interpreter lifetime. An additional +// `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. +template +class gil_safe_call_once_and_store { +public: + // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. 
+ template + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { + if (!is_last_storage_valid()) { + // Multiple threads may enter here, because the GIL is released in the next line and + // CPython API calls in the `fn()` call below may release and reacquire the GIL. + gil_scoped_release gil_rel; // Needed to establish lock ordering. + const void *const key = reinterpret_cast(this); + // There can be multiple threads going through here. + call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + auto &storage_map = *get_or_create_call_once_storage_map(); + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new call_once_storage{}; + storage_map.emplace(key, value); + } + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); + is_initialized_by_atleast_one_interpreter_ = true; + }); + // All threads will observe `is_initialized_by_atleast_one_interpreter_` as true here. + } + // Intentionally not returning `T &` to ensure the calling code is self-documenting. + return *this; + } + + // This must only be called after `call_once_and_store_result()` was called. 
+ T &get_stored() { + T *result = last_storage_ptr_; + if (!is_last_storage_valid()) { + gil_scoped_acquire gil_acq; + const void *const key = reinterpret_cast(this); + auto &storage_map = *get_or_create_call_once_storage_map(); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); + } + assert(result != nullptr); + return *result; + } + + gil_safe_call_once_and_store() = default; + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + +private: + bool is_last_storage_valid() const { + return is_initialized_by_atleast_one_interpreter_ + && detail::get_num_interpreters_seen() == 1; + } + + static call_once_storage_map_type *get_or_create_call_once_storage_map() { + error_scope err_scope; + dict state_dict = detail::get_python_state_dict(); + auto storage_map_obj = reinterpret_steal( + detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); + call_once_storage_map_type *storage_map = nullptr; + if (storage_map_obj) { + void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_call_once_storage_map() FAILED"); + throw error_already_set(); + } + storage_map = reinterpret_cast(raw_ptr); + } else { + storage_map = new call_once_storage_map_type(); + // Create capsule with destructor to clean up the storage map when the interpreter + // shuts down + state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] + = capsule(storage_map, [](void *ptr) noexcept { + auto *map = reinterpret_cast(ptr); + for (const auto &entry : *map) { + delete entry.second; + } + delete map; + }); + } + return storage_map; + } + + // No storage needed when subinterpreter support is 
enabled. + // The actual storage is stored in the per-interpreter state dict via + // `get_or_create_call_once_storage_map()`. + + // Fast local cache to avoid repeated map lookups while only a single interpreter has been seen. + // It is only trusted in that case (see `is_last_storage_valid()`); otherwise, it is not used. + T *last_storage_ptr_ = nullptr; + // This flag is true if the value has been initialized by any interpreter (may not be the + // current one). + atomic_bool is_initialized_by_atleast_one_interpreter_{false}; +}; +#endif + PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)