Commits

Armin Rigo committed e319f44

in-progress

  • Parent commits 515386d
  • Branches fast-gil

Files changed (7)

File rpython/jit/backend/llsupport/assembler.py

 
     @staticmethod
     @rgc.no_collect
-    def _release_gil_asmgcc(css):
-        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
-        from rpython.memory.gctransform import asmgcroot
-        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
-        next = asmgcroot.gcrootanchor.next
-        new.next = next
-        new.prev = asmgcroot.gcrootanchor
-        asmgcroot.gcrootanchor.next = new
-        next.prev = new
-        # and now release the GIL
-        before = rffi.aroundstate.before
-        if before:
-            before()
-
-    @staticmethod
-    @rgc.no_collect
-    def _reacquire_gil_asmgcc(css):
-        # first reacquire the GIL
-        after = rffi.aroundstate.after
-        if after:
-            after()
-        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
+    def _reacquire_gil_asmgcc(css, old_rpy_fastgil):
+        # Only called if rpy_fastgil was reset to a different value
+        # by another thread or by a callback.  See description in
+        # translator/c/src/thread_pthread.c.
+        if not old_rpy_fastgil:
+            # first reacquire the GIL
+            after = rffi.aroundstate.after
+            if after:
+                after()
+        else:
+            # we stole the GIL from a different thread that is also
+            # currently in an external call from the jit.  Attach
+            # the 'old_rpy_fastgil' into the chained list.
+            from rpython.memory.gctransform import asmgcroot
+            oth = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, old_rpy_fastgil)
+            next = asmgcroot.gcrootanchor.next
+            oth.next = next
+            oth.prev = asmgcroot.gcrootanchor
+            asmgcroot.gcrootanchor.next = oth
+            next.prev = oth
+        # similar to trackgcroot.py:pypy_asm_stackwalk, second part:
+        # detach the 'css' from the chained list
         from rpython.memory.gctransform import asmgcroot
         old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
         prev = old.prev
 
     @staticmethod
     @rgc.no_collect
-    def _release_gil_shadowstack():
-        before = rffi.aroundstate.before
-        if before:
-            before()
-
-    @staticmethod
-    @rgc.no_collect
     def _reacquire_gil_shadowstack():
+        # Simplified version of _reacquire_gil_asmgcc(): in shadowstack mode,
+        # rpy_fastgil contains only 0 or 1, and this must only be called when
+        # the old value stored in rpy_fastgil was 0.
         after = rffi.aroundstate.after
         if after:
             after()
 
-    @staticmethod
-    def _no_op():
-        pass
-
-    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
-    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
-                                                  lltype.Void))
+    _REACQGIL0_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
+    _REACQGIL2_FUNC = lltype.Ptr(lltype.FuncType([rffi.CCHARP, rffi.CCHARP],
+                                                 lltype.Void))
 
     def _build_release_gil(self, gcrootmap):
         if gcrootmap is None:
-            releasegil_func = llhelper(self._NOARG_FUNC, self._no_op)
-            reacqgil_func = llhelper(self._NOARG_FUNC, self._no_op)
+            pass
         elif gcrootmap.is_shadow_stack:
-            releasegil_func = llhelper(self._NOARG_FUNC,
-                                       self._release_gil_shadowstack)
-            reacqgil_func = llhelper(self._NOARG_FUNC,
+            reacqgil_func = llhelper(self._REACQGIL0_FUNC,
                                      self._reacquire_gil_shadowstack)
+            self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
         else:
-            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
-                                       self._release_gil_asmgcc)
-            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
+            reacqgil_func = llhelper(self._REACQGIL2_FUNC,
                                      self._reacquire_gil_asmgcc)
-        self.releasegil_addr  = self.cpu.cast_ptr_to_int(releasegil_func)
-        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
+            self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
 
     def _is_asmgcc(self):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
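
The attach and detach operations above are plain doubly-linked-list splices on the circular list anchored at asmgcroot.gcrootanchor. A minimal pure-Python sketch of the same two splices (FrameNode and the helper names are illustrative only; the real code manipulates raw ASM_FRAMEDATA_HEAD_PTR structures):

    class FrameNode(object):
        """Toy stand-in for an ASM_FRAMEDATA_HEAD structure."""
        def __init__(self):
            self.next = self
            self.prev = self

    gcrootanchor = FrameNode()   # circular list anchor, like asmgcroot.gcrootanchor

    def attach(node):
        # what _reacquire_gil_asmgcc() does with 'old_rpy_fastgil':
        # insert 'node' right after the anchor
        nxt = gcrootanchor.next
        node.next = nxt
        node.prev = gcrootanchor
        gcrootanchor.next = node
        nxt.prev = node

    def detach(node):
        # the "second part": unlink the 'css' frame of the finished call
        node.prev.next = node.next
        node.next.prev = node.prev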

File rpython/jit/backend/llsupport/callbuilder.py

 from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+from rpython.rlib import objectmodel
+
 
 class AbstractCallBuilder(object):
 
     def emit_call_release_gil(self):
         """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
         and reacqgil_addr."""
+        asmgcc = self.asm._is_asmgcc()
+        fastgil = objectmodel.prepare_enter_callback_from_jit(asmgcc)
         self.select_call_release_gil_mode()
         self.prepare_arguments()
         self.push_gcmap_for_call_release_gil()
-        self.call_releasegil_addr_and_move_real_arguments()
+        self.call_releasegil_addr_and_move_real_arguments(fastgil)
         self.emit_raw_call()
         self.restore_stack_pointer()
-        self.move_real_result_and_call_reacqgil_addr()
+        self.move_real_result_and_call_reacqgil_addr(fastgil)
         self.pop_gcmap()
         self.load_result()
 
-    def call_releasegil_addr_and_move_real_arguments(self):
+    def call_releasegil_addr_and_move_real_arguments(self, fastgil):
         raise NotImplementedError
 
-    def move_real_result_and_call_reacqgil_addr(self):
+    def move_real_result_and_call_reacqgil_addr(self, fastgil):
         raise NotImplementedError
 
     def select_call_release_gil_mode(self):
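
Taken together, the sequence emitted by emit_call_release_gil() implements a cheap release/reacquire handshake around the external call. A hedged Python model of what the generated machine code does (the helper names below are invented for the sketch; rpy_fastgil is modeled as a one-element list):

    import threading

    _xchg_lock = threading.Lock()   # models the atomicity of the XCHG instruction

    def atomic_exchange(cell, new):
        # toy atomic exchange over a one-element list
        with _xchg_lock:
            old, cell[0] = cell[0], new
            return old

    def call_release_gil_model(rpy_fastgil, external_function, reacqgil, args):
        token = 1                   # shadowstack mode stores 1; asmgcc stores the css pointer
        rpy_fastgil[0] = token      # call_releasegil_addr_and_move_real_arguments()
        result = external_function(*args)       # emit_raw_call(), runs without the GIL
        old = atomic_exchange(rpy_fastgil, 0)   # move_real_result_and_call_reacqgil_addr()
        if old != token:            # another thread stole the fast GIL in between
            reacqgil(old)           # slow path: block until the GIL is really ours
        return result

For example, call_release_gil_model([0], abs, lambda old: None, (-5,)) takes the fast path and returns 5.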

File rpython/jit/backend/x86/callbuilder.py

         self.current_esp = 0     # 0 or (usually) negative, counted in bytes
 
     def select_call_release_gil_mode(self):
-        """Overridden in CallBuilder64"""
         AbstractCallBuilder.select_call_release_gil_mode(self)
         if self.asm._is_asmgcc():
             from rpython.memory.gctransform import asmgcroot
             self.asm.set_extra_stack_depth(self.mc, 0)
         self.asm.pop_gcmap(self.mc)
 
-    def call_releasegil_addr_and_move_real_arguments(self):
-        initial_esp = self.current_esp
-        self.save_register_arguments()
+    def call_releasegil_addr_and_move_real_arguments(self, fastgil):
+        from rpython.jit.backend.x86.assembler import heap
         #
         if not self.asm._is_asmgcc():
             # the helper takes no argument
             self.change_extra_stack_depth = False
+            css_value = imm(1)
         else:
             from rpython.memory.gctransform import asmgcroot
             # build a 'css' structure on the stack: 2 words for the linkage,
             index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
             self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
             # Save the "return address": we pretend that it's css
-            if IS_X86_32:
-                reg = eax
-            elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
+            self.mc.LEA_rs(eax.value, css)           # LEA eax, [css]
             frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
+            self.mc.MOV_sr(frame_ptr, eax.value)     # MOV [css.frame], eax
             # Set up jf_extra_stack_depth to pretend that the return address
             # was at css, and so our stack frame is supposedly shorter by
             # (PASS_ON_MY_FRAME-JIT_USE_WORDS+1) words
             delta = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS + 1
             self.change_extra_stack_depth = True
             self.asm.set_extra_stack_depth(self.mc, -delta * WORD)
-            # Call the closestack() function (also releasing the GIL)
-            # with 'reg' as argument
-            if IS_X86_32:
-                self.subtract_esp_aligned(1)
-                self.mc.MOV_sr(0, reg.value)
-            #else:
-            #   on x86_64, reg is edi so that it is already correct
+            css_value = eax
         #
-        self.mc.CALL(imm(self.asm.releasegil_addr))
+        self.mc.MOV(heap(fastgil), css_value)
         #
         if not we_are_translated():        # for testing: we should not access
             self.mc.ADD(ebp, imm(1))       # ebp any more
+
+    def move_real_result_and_call_reacqgil_addr(self, fastgil):
+        from rpython.jit.backend.x86.assembler import heap
+        from rpython.jit.backend.x86 import rx86
         #
-        self.restore_register_arguments()
-        self.restore_stack_pointer(initial_esp)
-
-    def save_register_arguments(self):
-        """Overridden in CallBuilder64"""
-
-    def restore_register_arguments(self):
-        """Overridden in CallBuilder64"""
-
-    def move_real_result_and_call_reacqgil_addr(self):
         # save the result we just got (in eax/eax+edx/st(0)/xmm0)
         self.save_result_value()
        # call the reacqgil() function (slow path: really reacquire the GIL)
+        mc = self.mc
         if not self.asm._is_asmgcc():
-            css = 0     # the helper takes no argument
+            css_value = imm(1)
+            old_value = edx
         else:
             from rpython.memory.gctransform import asmgcroot
             css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
             if IS_X86_32:
-                reg = eax
+                css_value = ecx
+                old_value = edx
             elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)
-            if IS_X86_32:
-                self.mc.MOV_sr(0, reg.value)
+                css_value = edi
+                old_value = esi
+            mc.LEA_rs(css_value.value, css)
         #
-        self.mc.CALL(imm(self.asm.reacqgil_addr))
+        mc.XOR_rr(old_value.value, old_value.value)
+        if rx86.fits_in_32bits(fastgil):
+            mc.XCHG_rj(old_value.value, fastgil)
+        else:
+            mc.MOV_ri(X86_64_SCRATCH_REG.value, fastgil)
+            mc.XCHG_rm(old_value.value, (X86_64_SCRATCH_REG.value, 0))
+        mc.CMP(old_value, css_value)
+        mc.J_il8(rx86.Conditions['E'], 0)
+        je_location = mc.get_relative_pos()
+        #
+        if IS_X86_32:
+            mc.MOV_sr(0, css_value.value)
+            mc.MOV_sr(4, old_value.value)
+        mc.CALL(imm(self.asm.reacqgil_addr))
+        #
+        # patch the JE above
+        offset = mc.get_relative_pos() - je_location
+        assert 0 < offset <= 127
+        mc.overwrite(je_location-1, chr(offset))
         #
        if not we_are_translated():        # for testing: now we can access
             self.mc.SUB(ebp, imm(1))       # ebp again
             CallBuilderX86.load_result(self)
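
The J_il8/overwrite pair above is the backend's usual forward-jump patching idiom: emit JE with a zero displacement, remember the position just past it with get_relative_pos(), and patch the one-byte offset once the slow path's length is known. The same idiom over a plain byte buffer (a sketch; ToyCodeBuffer is not the real MachineCodeBlockWrapper API):

    class ToyCodeBuffer(object):
        def __init__(self):
            self.buf = bytearray()
        def emit(self, *byts):
            self.buf.extend(byts)
        def get_relative_pos(self):
            return len(self.buf)
        def overwrite(self, pos, byte):
            self.buf[pos] = byte

    mc = ToyCodeBuffer()
    mc.emit(0x74, 0x00)              # JE rel8, displacement still unknown
    je_location = mc.get_relative_pos()
    mc.emit(0x90, 0x90, 0x90)        # stand-in for the slow-path CALL sequence
    offset = mc.get_relative_pos() - je_location
    assert 0 < offset <= 127         # rel8 is a signed byte: the slow path must stay short
    mc.overwrite(je_location - 1, offset)    # patch the displacement byte of the JE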
 
     def save_result_value(self):
-        # Temporarily save the result value into [ESP+4].  We use "+4"
-        # in order to leave the word at [ESP+0] free, in case it's needed
+        # Temporarily save the result value into [ESP+8].  We use "+8"
+        # in order to leave the two initial words free, in case they are needed
         if self.ressize == 0:      # void return
             return
         if self.resloc.is_float():
             # a float or a long long return
-            self.tmpresloc = RawEspLoc(4, FLOAT)
+            self.tmpresloc = RawEspLoc(8, FLOAT)
             if self.restype == 'L':
-                self.mc.MOV_sr(4, eax.value)      # long long
-                self.mc.MOV_sr(8, edx.value)
+                self.mc.MOV_sr(8, eax.value)      # long long
+                self.mc.MOV_sr(12, edx.value)
             else:
-                self.mc.FSTPL_s(4)                # float return
+                self.mc.FSTPL_s(8)                # float return
         else:
-            self.tmpresloc = RawEspLoc(4, INT)
+            self.tmpresloc = RawEspLoc(8, INT)
             if self.restype == 'S':
-                self.mc.FSTPS_s(4)
+                self.mc.FSTPS_s(8)
             else:
                 assert self.restype == INT
                 assert self.ressize <= WORD
-                self.mc.MOV_sr(4, eax.value)
+                self.mc.MOV_sr(8, eax.value)
 
 
 class CallBuilder64(CallBuilderX86):
 
     ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
     ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
-    DONT_MOVE_GPR = []
     _ALL_CALLEE_SAVE_GPR = [ebx, r12, r13, r14, r15]
 
     next_arg_gpr = 0
             res = self.ARGUMENTS_GPR[i]
         except IndexError:
             return None
-        if hint in self.DONT_MOVE_GPR:
-            for j in range(i):
-                if hint is self.ARGUMENTS_GPR[j]:
-                    break
-            else:
-                self.ARGUMENTS_GPR[i] = hint
-                res = hint
         return res
 
     def _unused_xmm(self):
         except IndexError:
             return None
 
-    def _permute_to_prefer_unused_registers(self, lst):
-        # permute 'lst' so that it starts with registers that are not
-        # in 'self.already_used', and ends with registers that are.
-        N = len(lst)
-        i = 0
-        while i < N:
-            reg = lst[i]
-            if reg in self.already_used:
-                # move this reg to the end, and decrement N
-                N -= 1
-                assert N >= i
-                lst[N], lst[i] = lst[i], lst[N]
-            else:
-                i += 1
-
-    def select_call_release_gil_mode(self):
-        CallBuilderX86.select_call_release_gil_mode(self)
-        # We have to copy the arguments around a bit more in this mode,
-        # but on the other hand we don't need prepare_arguments() moving
-        # them in precisely the final registers.  Here we look around for
-        # unused registers that may be more likely usable.
-        from rpython.jit.backend.x86.regalloc import X86_64_RegisterManager
-        from rpython.jit.backend.x86.regalloc import X86_64_XMMRegisterManager
-        self.already_used = {}
-        for loc in self.arglocs:
-            self.already_used[loc] = None
-        #
-        lst = X86_64_RegisterManager.save_around_call_regs[:]
-        self._permute_to_prefer_unused_registers(lst)
-        # <optimization>
-        extra = []
-        for reg in self.asm._regalloc.rm.free_regs:
-            if (reg not in self.already_used and
-                    reg in self._ALL_CALLEE_SAVE_GPR):
-                extra.append(reg)
-        self.free_callee_save_gprs = extra
-        lst = extra + lst
-        # </optimization>
-        self.ARGUMENTS_GPR = lst[:len(self.ARGUMENTS_GPR)]
-        self.DONT_MOVE_GPR = self._ALL_CALLEE_SAVE_GPR
-        #
-        lst = X86_64_XMMRegisterManager.save_around_call_regs[:]
-        self._permute_to_prefer_unused_registers(lst)
-        self.ARGUMENTS_XMM = lst[:len(self.ARGUMENTS_XMM)]
-
     def prepare_arguments(self):
         src_locs = []
         dst_locs = []
             assert self.restype == INT
             self.mc.MOV(self.tmpresloc, eax)
 
-    def save_register_arguments(self):
-        # Save the argument registers, which are given by self.ARGUMENTS_xxx.
-        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
-        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
-        n_saved_regs = n_gpr + n_xmm
-        for i in range(n_gpr):
-            if self.ARGUMENTS_GPR[i] in self._ALL_CALLEE_SAVE_GPR:
-                n_saved_regs -= 1     # don't need to save it
-        self.subtract_esp_aligned(n_saved_regs)
-        #
-        n = 0
-        for i in range(n_gpr):
-            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
-                self.mc.MOV_sr(n * WORD, self.ARGUMENTS_GPR[i].value)
-                n += 1
-        for i in range(n_xmm):
-            self.mc.MOVSD_sx(n * WORD, self.ARGUMENTS_XMM[i].value)
-            n += 1
-        assert n == n_saved_regs
-        self.n_saved_regs = n_saved_regs
-
-    def restore_register_arguments(self):
-        # Restore the saved values into the *real* registers used for calls
-        # --- which are not self.ARGUMENTS_xxx!
-        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
-        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
-        #
-        n = 0
-        for i in range(n_gpr):
-            tgtvalue = CallBuilder64.ARGUMENTS_GPR[i].value
-            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
-                self.mc.MOV_rs(tgtvalue, n * WORD)
-                n += 1
-            else:
-                self.mc.MOV_rr(tgtvalue, self.ARGUMENTS_GPR[i].value)
-        for i in range(n_xmm):
-            self.mc.MOVSD_xs(CallBuilder64.ARGUMENTS_XMM[i].value, n * WORD)
-            n += 1
-        assert n == self.n_saved_regs
-        #
-        if isinstance(self.fnloc, RegLoc):    # fix this register
-            self.fnloc = CallBuilder64.ARGUMENTS_GPR[n_gpr - 1]
-
 
 if IS_X86_32:
     CallBuilder = CallBuilder32

File rpython/jit/backend/x86/rx86.py

    # XXX: Only here for testing purposes... "as" happens to encode the
     # registers in the opposite order that we would otherwise do in a
     # register-register exchange.
-    #XCHG_rr = insn(rex_w, '\x87', register(1), register(2,8), '\xC0')
+    XCHG_rr = insn(rex_w, '\x87', register(1), register(2,8), '\xC0')
 
     JMP_l = insn('\xE9', relative(1))
     JMP_r = insn(rex_nw, '\xFF', orbyte(4<<3), register(1), '\xC0')
 
 define_modrm_modes('SQRTSD_x*', ['\xF2', rex_nw, '\x0F\x51', register(1,8)], regtype='XMM')
 
-#define_modrm_modes('XCHG_r*', [rex_w, '\x87', register(1, 8)])
+define_modrm_modes('XCHG_r*', [rex_w, '\x87', register(1, 8)])
 
 define_modrm_modes('ADDSD_x*', ['\xF2', rex_nw, '\x0F\x58', register(1, 8)], regtype='XMM')
 define_modrm_modes('ADDPD_x*', ['\x66', rex_nw, '\x0F\x58', register(1, 8)], regtype='XMM')

File rpython/rlib/objectmodel.py

     llhelper(rffi.AroundFnPtr, before)
     llhelper(rffi.AroundFnPtr, after)
 
+def _enter_callback_from_jit():
+    from rpython.rlib import rthread
+    rthread.gil_enter_callback_without_gil()    # no need for errno saving
+
+def prepare_enter_callback_from_jit(is_asmgcc):
+    from rpython.rlib import rthread
+    from rpython.rtyper.lltypesystem import rffi
+    if rffi.aroundstate.after is None:
+        rffi.aroundstate.after = _enter_callback_from_jit
+        from rpython.rtyper.annlowlevel import llhelper
+        llhelper(rffi.AroundFnPtr, _enter_callback_from_jit)
+    return rthread.get_fastgil_addr_raw(is_asmgcc)
+
 def is_in_callback():
     from rpython.rtyper.lltypesystem import rffi
     return rffi.stackcounter.stacks_counter > 1
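
prepare_enter_callback_from_jit() is meant to be called once per CALL_RELEASE_GIL site while the backend assembles the call, as callbuilder.py does above; the returned raw address of the C-level rpy_fastgil word is then baked into the generated code. A hedged sketch of the call site:

    from rpython.rlib import objectmodel

    def emit_call_release_gil_sketch(backend):
        # 'backend' stands in for the call builder; only the two lines that
        # mirror AbstractCallBuilder.emit_call_release_gil() are shown
        asmgcc = backend.asm._is_asmgcc()
        fastgil = objectmodel.prepare_enter_callback_from_jit(asmgcc)
        # the backend then emits 'MOV [fastgil], nonzero' to release the GIL
        # and an 'XCHG [fastgil], 0' to try to take it back afterwards
        return fastgil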

File rpython/rlib/rthread.py

                               _nowrapper=True)
 gil_acquire      = llexternal('RPyGilAcquire', [], lltype.Void,
                               _nowrapper=True)
+gil_enter_callback_without_gil = (
+                   llexternal('RPyEnterCallbackWithoutGil', [], lltype.Void,
+                              _nowrapper=True))
+
+@specialize.memo()
+def _fetch_fastgil(rpy_fastgil_value):
+    eci = ExternalCompilationInfo(
+        pre_include_bits = ['#define RPY_FASTGIL %d' % rpy_fastgil_value])
+    return rffi.llexternal('RPyFetchFastGil', [], lltype.Signed,
+                           compilation_info=eci, sandboxsafe=True)
+
+def get_fastgil_addr_raw(is_asmgcc):
+    if is_asmgcc:   # must be constant!
+        return _fetch_fastgil(42)
+    else:
+        return _fetch_fastgil(1)
+
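@specialize.memo() guarantees that _fetch_fastgil() is evaluated once per distinct constant argument during translation, so the asmgcc and non-asmgcc callers end up with two separate RPyFetchFastGil declarations, each compiled with its own RPY_FASTGIL value. The pattern, illustrated in plain Python (a toy model of the memoization, not the RPython annotator):

    _memo_cache = {}

    def fetch_fastgil(value):
        # one distinct "external function" per compile-time constant
        if value not in _memo_cache:
            def external():
                return "RPyFetchFastGil compiled with RPY_FASTGIL=%d" % value
            _memo_cache[value] = external
        return _memo_cache[value]

    assert fetch_fastgil(42) is fetch_fastgil(42)       # memoized
    assert fetch_fastgil(42) is not fetch_fastgil(1)    # one copy per constant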
 
 def allocate_lock():
     return Lock(allocate_ll_lock())

File rpython/translator/c/src/thread_pthread.c

     if (pending_acquires >= 0)
         assert_has_the_gil();
 #endif
+    /* Note that 'pending_acquires' is only manipulated when we hold the
+       GIL, with one exception: RPyGilAcquire() increases it by one
+       before it waits for the GIL mutex.  Thus the only race condition
+       here should be harmless: the other thread already increased
+       'pending_acquires' but is still not in the pthread_mutex_lock().
+       That's fine.  Note that we release the mutex in the
+       pthread_cond_wait() below.
+    */
     if (pending_acquires <= 0)
         return 0;
     atomic_add(&pending_acquires, 1L);
     ASSERT_STATUS(pthread_cond_signal(&cond_gil));
 }
 
-#ifdef RPY_FASTGIL_VARNAME
+#ifdef RPY_FASTGIL
 #include <time.h>
 
+static void *rpy_fastgil = NULL;
+
+Signed RPyFetchFastGil(void)
+{
+    return (Signed)(&rpy_fastgil);
+}
+
 static inline void *atomic_xchg(void **ptr, void *value)
 {
     void *result;
-    asm volatile (
 #if defined(__amd64__)
-                  "xchgq %0, %1  /* automatically locked */"
+    asm volatile ("xchgq %0, %1  /* automatically locked */"
+                  : "r"(result) : "0"(value), "m"(*ptr) : "memory");
 #elif defined(__i386__)
-                  "xchgl %0, %1  /* automatically locked */"
+    asm volatile ("xchgl %0, %1  /* automatically locked */"
+                  : "r"(result) : "0"(value), "m"(*ptr) : "memory");
 #else
-#  error "RPY_FASTGIL_VARNAME: only for x86 right now"
+    /* requires gcc >= 4.1 */
+    while (1) {
+        result = *ptr;
+        if (__sync_bool_compare_and_swap(ptr, result, value))
+            break;
+    }
 #endif
-                  : "r"(result) : "0"(value), "m"(*ptr) : "memory");
     return result;
 }
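
On targets other than x86, the exchange above falls back to a retry loop around __sync_bool_compare_and_swap. The loop's logic, modeled in Python over a toy CAS (the lock only stands in for the atomicity that the gcc builtin provides):

    import threading

    _cas_lock = threading.Lock()

    def compare_and_swap(cell, expected, new):
        # toy model of __sync_bool_compare_and_swap on a one-element list
        with _cas_lock:
            if cell[0] == expected:
                cell[0] = new
                return True
            return False

    def exchange_via_cas(cell, new):
        # keep retrying until our snapshot of the old value is still current
        while True:
            old = cell[0]
            if compare_and_swap(cell, old, new):
                return old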
 
+int RPyEnterCallbackWithoutGil(void)
+{
+    /* this function must be called when entering a callback, even when
+       there is no real GIL.  It only checks for a non-null value
+       in 'rpy_fastgil'.
+
+       Note: doesn't use any pthread_xx function, so is errno-safe.
+    */
+    void *fastgilvalue;
+    fastgilvalue = atomic_xchg(&rpy_fastgil, NULL);
+    if (fastgilvalue != NULL) {
+        /* yes, succeeded.  We know that the other thread is before
+           the return to JITted assembler from the C function call.
+           The JITted assembler will definitely call RPyGilAcquire()
+           then.  So we can just pretend that the GIL --- which is
+           still acquired --- is ours now.  We only need to fix
+           the asmgcc linked list.
+        */
+#if RPY_FASTGIL == 42    /* special value to mean "asmgcc" */
+        struct pypy_ASM_FRAMEDATA_HEAD0 *new =
+            (struct pypy_ASM_FRAMEDATA_HEAD0 *)fastgilvalue;
+        struct pypy_ASM_FRAMEDATA_HEAD0 *root = &pypy_g_ASM_FRAMEDATA_HEAD;
+        struct pypy_ASM_FRAMEDATA_HEAD0 *next = root->as_next;
+        new->as_next = next;
+        new->as_prev = root;
+        root->as_next = new;
+        next->as_prev = new;
+#endif
+        return 1;
+    }
+    return 0;
+}
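
The steal itself is a single atomic exchange: a thread that wants the GIL swaps NULL into rpy_fastgil, and a non-NULL result proves the releasing thread has not yet handed the GIL back to the mutex, so ownership transfers silently. A small Python model of that decision (atomic_exchange is the same toy helper as in the callbuilder.py sketch, repeated so the snippet stands alone):

    import threading

    _xchg_lock = threading.Lock()

    def atomic_exchange(cell, new):
        with _xchg_lock:
            old, cell[0] = cell[0], new
            return old

    def enter_callback_without_gil(rpy_fastgil):
        # model of RPyEnterCallbackWithoutGil(): returns True if the fast
        # GIL was stolen, i.e. the GIL is now ours without touching the mutex
        old = atomic_exchange(rpy_fastgil, None)
        if old is not None:
            # in asmgcc mode 'old' is the other thread's css frame and must
            # be re-linked into the ASM_FRAMEDATA list, as the C code does above
            return True
        return False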
+
 static inline void timespec_add(struct timespec *t, long incr)
 {
     long nsec = t->tv_nsec + incr;
     /* Support for the JIT, which generates calls to external C
        functions using the following very fast pattern:
 
-       * the global variable 'RPY_FASTGIL_VARNAME' (a macro naming the
-         real variable) contains normally 0
+       * the global variable 'rpy_fastgil' normally contains 0
 
        * before doing an external C call, the generated assembler sets
          this global variable to an in-stack pointer to its
     while (1) {
 
         /* try to see if we can steal the fast GIL */
-        void *fastgilvalue;
-        fastgilvalue = atomic_xchg(&RPY_FASTGIL_VARNAME, NULL);
-        if (fastgilvalue != NULL) {
-            /* yes, succeeded.  We know that the other thread is before
-               the return to JITted assembler from the C function call.
-               The JITted assembler will definitely call RPyGilAcquire()
-               then.  So we can just pretend that the GIL --- which is
-               still acquired --- is ours now.  We only need to fix
-               the asmgcc linked list.
-            */
-            struct pypy_ASM_FRAMEDATA_HEAD0 *new =
-                (struct pypy_ASM_FRAMEDATA_HEAD0 *)fastgilvalue;
-            struct pypy_ASM_FRAMEDATA_HEAD0 *root = &pypy_g_ASM_FRAMEDATA_HEAD;
-            struct pypy_ASM_FRAMEDATA_HEAD0 *next = root->as_next;
-            new->as_next = next;
-            new->as_prev = root;
-            root->as_next = new;
-            next->as_prev = new;
+        if (RPyEnterCallbackWithoutGil())
             return;
-        }
 
         /* sleep for a bit of time */
         clock_gettime(CLOCK_REALTIME, &t);
         return;
     }
     atomic_add(&pending_acquires, 1L);
-#ifdef RPY_FASTGIL_VARNAME
+#ifdef RPY_FASTGIL
     _acquire_gil_or_wait_for_fastgil_to_be_nonzero();
 #else
     ASSERT_STATUS(pthread_mutex_lock(&mutex_gil));
     assert_has_the_gil();
     _debug_print("RPyGilAcquire\n");
 }
-
-XXX even without a gil, we need to check at least for a RPY_FASTGIL_VARNAME
-that is not null, in callbacks