From 949b5ec8e6136c125b12566d869307899ba1c3b4 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sat, 17 Jan 2026 08:34:39 +0800 Subject: [PATCH 1/2] gh-142913: Remove JIT interpreter for interpreter generator tests (GH-143944) --- Modules/_testinternalcapi/interpreter.c | 125 +++--------------------- Tools/c-analyzer/cpython/_parser.py | 1 + 2 files changed, 13 insertions(+), 113 deletions(-) diff --git a/Modules/_testinternalcapi/interpreter.c b/Modules/_testinternalcapi/interpreter.c index c3191ac20a3199..2cd23fa3c58849 100644 --- a/Modules/_testinternalcapi/interpreter.c +++ b/Modules/_testinternalcapi/interpreter.c @@ -11,6 +11,18 @@ int Test_EvalFrame_Resumes, Test_EvalFrame_Loads; +#ifdef _Py_TIER2 +static int +stop_tracing_and_jit(PyThreadState *tstate, _PyInterpreterFrame *frame) +{ + (void)(tstate); + (void)(frame); + return 0; +} +#endif + +_PyJitEntryFuncPtr _Py_jit_entry; + #if _Py_TAIL_CALL_INTERP #include "test_targets.h" #include "test_cases.c.h" @@ -78,12 +90,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) frame->previous = &entry.frame; tstate->current_frame = frame; entry.frame.localsplus[0] = PyStackRef_NULL; -#ifdef _Py_TIER2 - if (tstate->current_executor != NULL) { - entry.frame.localsplus[0] = PyStackRef_FromPyObjectNew(tstate->current_executor); - tstate->current_executor = NULL; - } -#endif /* support for generator.throw() */ if (throwflag) { @@ -119,11 +125,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) #endif } - #if defined(_Py_TIER2) && !defined(_Py_JIT) - /* Tier 2 interpreter state */ - _PyExecutorObject *current_executor = NULL; - const _PyUOpInstruction *next_uop = NULL; -#endif #if _Py_TAIL_CALL_INTERP # if Py_STATS return _TAIL_CALL_start_frame(frame, NULL, tstate, NULL, instruction_funcptr_handler_table, 0, lastopcode); @@ -136,108 +137,6 @@ Test_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag) #endif -#ifdef _Py_TIER2 - -// Tier 2 is also here! -enter_tier_two: - -#ifdef _Py_JIT - assert(0); -#else - -#undef LOAD_IP -#define LOAD_IP(UNUSED) (void)0 - -#ifdef Py_STATS -// Disable these macros that apply to Tier 1 stats when we are in Tier 2 -#undef STAT_INC -#define STAT_INC(opname, name) ((void)0) -#undef STAT_DEC -#define STAT_DEC(opname, name) ((void)0) -#endif - -#undef ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZATION 0 -#undef ENABLE_SPECIALIZATION_FT -#define ENABLE_SPECIALIZATION_FT 0 - - ; // dummy statement after a label, before a declaration - uint16_t uopcode; -#ifdef Py_STATS - int lastuop = 0; - uint64_t trace_uop_execution_counter = 0; -#endif - - assert(next_uop->opcode == _START_EXECUTOR); -tier2_dispatch: - for (;;) { - uopcode = next_uop->opcode; -#ifdef Py_DEBUG - if (frame->lltrace >= 3) { - dump_stack(frame, stack_pointer); - if (next_uop->opcode == _START_EXECUTOR) { - printf("%4d uop: ", 0); - } - else { - printf("%4d uop: ", (int)(next_uop - current_executor->trace)); - } - _PyUOpPrint(next_uop); - printf("\n"); - } -#endif - next_uop++; - OPT_STAT_INC(uops_executed); - UOP_STAT_INC(uopcode, execution_count); - UOP_PAIR_INC(uopcode, lastuop); -#ifdef Py_STATS - trace_uop_execution_counter++; - ((_PyUOpInstruction *)next_uop)[-1].execution_count++; -#endif - - switch (uopcode) { - -#include "executor_cases.c.h" - - default: -#ifdef Py_DEBUG - { - printf("Unknown uop: "); - _PyUOpPrint(&next_uop[-1]); - printf(" @ %d\n", (int)(next_uop - current_executor->trace - 1)); - Py_FatalError("Unknown uop"); - } -#else - Py_UNREACHABLE(); -#endif - - } - } - -jump_to_error_target: -#ifdef Py_DEBUG - if (frame->lltrace >= 2) { - printf("Error: [UOp "); - _PyUOpPrint(&next_uop[-1]); - printf(" @ %d -> %s]\n", - (int)(next_uop - current_executor->trace - 1), - _PyOpcode_OpName[frame->instr_ptr->op.code]); - } -#endif - assert(next_uop[-1].format == UOP_FORMAT_JUMP); - uint16_t target = uop_get_error_target(&next_uop[-1]); - next_uop = current_executor->trace + target; - goto tier2_dispatch; - -jump_to_jump_target: - assert(next_uop[-1].format == UOP_FORMAT_JUMP); - target = uop_get_jump_target(&next_uop[-1]); - next_uop = current_executor->trace + target; - goto tier2_dispatch; - -#endif // _Py_JIT - -#endif // _Py_TIER2 - early_exit: assert(_PyErr_Occurred(tstate)); _Py_LeaveRecursiveCallPy(tstate); diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index 34009538991e98..bad1a3eb048782 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -83,6 +83,7 @@ def format_tsv_lines(lines): 'Python/opcode_targets.h', 'Modules/_testinternalcapi/test_targets.h', 'Modules/_testinternalcapi/test_cases.c.h', + 'Modules/_testinternalcapi/interpreter.c', # XXX: Throws errors if PY_VERSION_HEX is not mocked out 'Modules/clinic/_testclinic_depr.c.h', From bb25f7280af30831fffa3345b4fc93798949c6c6 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Fri, 16 Jan 2026 17:54:43 -0800 Subject: [PATCH 2/2] gh-132657: Add maybe_enable_deferred_ref_count() (gh-142843) If we are specializing to `LOAD_GLOBAL_MODULE` or `LOAD_ATTR_MODULE`, try to enable deferred reference counting for the value, if the object is owned by a different thread. This applies to the free-threaded build only and should improve scaling of multi-threaded programs. --- Include/internal/pycore_dict.h | 1 + ...-12-24-13-19-16.gh-issue-132657._P4DDb.rst | 6 ++++ Objects/dictobject.c | 12 ++++++-- Python/specialize.c | 30 +++++++++++++++++-- Tools/ftscalingbench/ftscalingbench.py | 9 ++++++ 5 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index a7005a3b8e2fab..950547cb002f4c 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -114,6 +114,7 @@ extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name, _PyStackRef *method); +extern Py_ssize_t _PyDict_LookupIndexAndValue(PyDictObject *, PyObject *, PyObject **); extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *); extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key); diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst new file mode 100644 index 00000000000000..bbc9611b748fde --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst @@ -0,0 +1,6 @@ +If we are specializing to ``LOAD_GLOBAL_MODULE`` or ``LOAD_ATTR_MODULE``, try +to enable deferred reference counting for the value, if the object is owned by +a different thread. This applies to the free-threaded build only and should +improve scaling of multi-threaded programs. Note that when deferred reference +counting is enabled, the object will be deallocated by the GC, rather than by +:c:func:`Py_DECREF`. diff --git a/Objects/dictobject.c b/Objects/dictobject.c index a4e2fd19cefb63..aea9ea84202b07 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -2349,10 +2349,9 @@ dict_unhashable_type(PyObject *key) } Py_ssize_t -_PyDict_LookupIndex(PyDictObject *mp, PyObject *key) +_PyDict_LookupIndexAndValue(PyDictObject *mp, PyObject *key, PyObject **value) { // TODO: Thread safety - PyObject *value; assert(PyDict_CheckExact((PyObject*)mp)); assert(PyUnicode_CheckExact(key)); @@ -2362,7 +2361,14 @@ _PyDict_LookupIndex(PyDictObject *mp, PyObject *key) return -1; } - return _Py_dict_lookup(mp, key, hash, &value); + return _Py_dict_lookup(mp, key, hash, value); +} + +Py_ssize_t +_PyDict_LookupIndex(PyDictObject *mp, PyObject *key) +{ + PyObject *value; // discarded + return _PyDict_LookupIndexAndValue(mp, key, &value); } /* Same as PyDict_GetItemWithError() but with hash supplied by caller. diff --git a/Python/specialize.c b/Python/specialize.c index 2f82fb4ff4ef84..432053f85221a3 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -358,6 +358,21 @@ static int function_kind(PyCodeObject *code); static bool function_check_args(PyObject *o, int expected_argcount, int opcode); static uint32_t function_get_version(PyObject *o, int opcode); +#ifdef Py_GIL_DISABLED +static void +maybe_enable_deferred_ref_count(PyObject *op) +{ + if (!_Py_IsOwnedByCurrentThread(op)) { + // For module level variables that are heavily used from multiple + // threads, deferred reference counting provides good scaling + // benefits. The downside is that the object will only be deallocated + // by a GC run. + PyUnstable_Object_EnableDeferredRefcount(op); + } +} +#endif + + static int specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, PyObject *name) { @@ -366,7 +381,8 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_ATTR_NON_STRING); return -1; } - Py_ssize_t index = _PyDict_LookupIndex(dict, name); + PyObject *value; + Py_ssize_t index = _PyDict_LookupIndexAndValue(dict, name, &value); assert(index != DKIX_ERROR); if (index != (uint16_t)index) { SPECIALIZATION_FAIL(LOAD_ATTR, @@ -381,6 +397,9 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_OUT_OF_VERSIONS); return -1; } +#ifdef Py_GIL_DISABLED + maybe_enable_deferred_ref_count(value); +#endif write_u32(cache->version, keys_version); cache->index = (uint16_t)index; specialize(instr, LOAD_ATTR_MODULE); @@ -1269,7 +1288,6 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr, return 1; } - static void specialize_load_global_lock_held( PyObject *globals, PyObject *builtins, @@ -1289,7 +1307,12 @@ specialize_load_global_lock_held( SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_LOAD_GLOBAL_NON_STRING_OR_SPLIT); goto fail; } +#ifdef Py_GIL_DISABLED + PyObject *value; + Py_ssize_t index = _PyDict_LookupIndexAndValue((PyDictObject *)globals, name, &value); +#else Py_ssize_t index = _PyDictKeys_StringLookup(globals_keys, name); +#endif if (index == DKIX_ERROR) { SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_EXPECTED_ERROR); goto fail; @@ -1310,6 +1333,9 @@ specialize_load_global_lock_held( SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_OUT_OF_RANGE); goto fail; } +#ifdef Py_GIL_DISABLED + maybe_enable_deferred_ref_count(value); +#endif cache->index = (uint16_t)index; cache->module_keys_version = (uint16_t)keys_version; specialize(instr, LOAD_GLOBAL_MODULE); diff --git a/Tools/ftscalingbench/ftscalingbench.py b/Tools/ftscalingbench/ftscalingbench.py index 097a065f368f30..c2bd7c3880bc90 100644 --- a/Tools/ftscalingbench/ftscalingbench.py +++ b/Tools/ftscalingbench/ftscalingbench.py @@ -21,6 +21,7 @@ # > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost # +import copy import math import os import queue @@ -214,6 +215,14 @@ def instantiate_dataclass(): for _ in range(1000 * WORK_SCALE): obj = MyDataClass(x=1, y=2, z=3) + +@register_benchmark +def deepcopy(): + x = {'list': [1, 2], 'tuple': (1, None)} + for i in range(40 * WORK_SCALE): + copy.deepcopy(x) + + def bench_one_thread(func): t0 = time.perf_counter_ns() func()