Armin Rigo avatar Armin Rigo committed 7c80121 Merge

hg merge emit-call-x86: fix for multithreaded programs, particularly
those that run more threads than cores. The issue was that a
call_release_gil instruction compiles to code that still accesses
ebp/rsp after it released the GIL.

Comments (0)

Files changed (10)

pypy/module/pypyjit/test_pypy_c/bug1.py

+import cffi, thread, time, sys
+
+
+ffi = cffi.FFI()
+
+ffi.cdef("""
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4);
+""")
+
+lib = ffi.verify("""
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4)
+    {
+        return a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+               (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+               (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+               (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7);
+    }
+""")
+
+
+def runme():
+    # Workload run by each thread: call lib.foobar() 10 * 10000 times with
+    # 24 integer arguments and check every result against the same weighted
+    # sum computed in Python.
+    for j in range(10):
+        for i in range(10000):
+            # 24 distinct values derived from i: i, i-1, ..., i-23
+            args = [i-k for k in range(24)]
+            x = lib.foobar(*args)
+            (a,b,c,d,e,f,a2,b2,c2,d2,e2,f2,
+             a3,b3,c3,d3,e3,f3,a4,b4,c4,d4,e4,f4) = args
+            # must match the formula in the C source passed to ffi.verify()
+            assert x == (
+                a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+                (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+                (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+                (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7))
+
+done = []
+
+def submain():
+    """Thread entry point: run runme() and record the outcome in 'done'.
+
+    Appends None on success, or the sys.exc_info() triple if runme()
+    raised anything at all.
+    """
+    outcome = None
+    try:
+        runme()
+    except:
+        # deliberately catch everything: the failure is only recorded here
+        # so that it can be reported from outside this thread
+        outcome = sys.exc_info()
+    done.append(outcome)
+
+# Start two threads that both run submain().
+for i in range(2):
+    thread.start_new_thread(submain, ())
+# Poll until each thread has appended its outcome to 'done'.
+while len(done) < 2:
+    time.sleep(0.1)
+
+# Re-raise the first recorded failure (type, value, traceback), if any.
+for err in done:
+    if err is not None:
+        raise err[0], err[1], err[2]

pypy/module/pypyjit/test_pypy_c/test_bug.py

+import os, sys, py, subprocess
+
+localdir = os.path.dirname(os.path.abspath(__file__))
+
+
+def test_bug1():
+    """Run bug1.py under 'taskset -c 0' and require exit status 0.
+
+    Skipped on anything but Linux.
+    """
+    if not sys.platform.startswith('linux'):
+        py.test.skip("linux-only test")
+
+    script = os.path.join(localdir, 'bug1.py')
+    exitcode = subprocess.call(['taskset', '-c', '0',
+                                sys.executable, script])
+    assert exitcode == 0

rpython/jit/backend/llgraph/test/test_llgraph.py

     def test_memoryerror(self):
         py.test.skip("does not make much sense on the llgraph backend")
 
+    def test_call_release_gil_variable_function_and_arguments(self):
+        # Always skipped in this test class; the skip message gives the
+        # reason.
+        py.test.skip("the arguments seem not correctly casted")
+
 
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))

rpython/jit/backend/llsupport/assembler.py

             self.malloc_slowpath_unicode = None
 
         self._build_stack_check_slowpath()
-        if gc_ll_descr.gcrootmap:
-            self._build_release_gil(gc_ll_descr.gcrootmap)
+        self._build_release_gil(gc_ll_descr.gcrootmap)
         if not self._debug:
             # if self._debug is already set it means that someone called
             # set_debug by hand before initializing the assembler. Leave it
         if after:
             after()
 
+    @staticmethod
+    def _no_op():
+        # Deliberately empty: _build_release_gil() wraps this with llhelper()
+        # as both the release-GIL and the reacquire-GIL function when
+        # gcrootmap is None.
+        pass
+
     _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
     _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                   lltype.Void))
 
     def _build_release_gil(self, gcrootmap):
-        if gcrootmap.is_shadow_stack:
+        if gcrootmap is None:
+            releasegil_func = llhelper(self._NOARG_FUNC, self._no_op)
+            reacqgil_func = llhelper(self._NOARG_FUNC, self._no_op)
+        elif gcrootmap.is_shadow_stack:
             releasegil_func = llhelper(self._NOARG_FUNC,
                                        self._release_gil_shadowstack)
             reacqgil_func = llhelper(self._NOARG_FUNC,

rpython/jit/backend/test/runner_test.py

         assert rffi.charp2strn(buffer, buflen) == cwd
         lltype.free(buffer, flavor='raw')
 
+    def test_call_release_gil_return_types(self):
+        # For each (ffi type, sample value, lltype) triple below, compile a
+        # loop that does a CALL_RELEASE_GIL to a function returning the
+        # sample value, and check that the value read back from the
+        # deadframe is equal to it.
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+        cpu = self.cpu
+
+        for ffitype, result, TP in [
+            (types.ulong,  r_uint(sys.maxint + 10), lltype.Unsigned),
+            (types.slong,  -4321, lltype.Signed),
+            (types.uint8,  200, rffi.UCHAR),
+            (types.sint8,  -42, rffi.SIGNEDCHAR),
+            (types.uint16, 50000, rffi.USHORT),
+            (types.sint16, -20000, rffi.SHORT),
+            (types.uint32, r_uint(3000000000), rffi.UINT),
+            (types.sint32, -2000000000, rffi.INT),
+            (types.uint64, r_ulonglong(9999999999999999999),
+                                                   lltype.UnsignedLongLong),
+            (types.sint64, r_longlong(-999999999999999999),
+                                                   lltype.SignedLongLong),
+            (types.double, 12.3475226, rffi.DOUBLE),
+            (types.float,  r_singlefloat(-592.75), rffi.FLOAT),
+            ]:
+            # skip the result types that the cpu under test does not support
+            if sys.maxint < 2**32 and TP in (lltype.SignedLongLong,
+                                             lltype.UnsignedLongLong):
+                if not cpu.supports_longlong:
+                    continue
+            if TP == rffi.DOUBLE:
+                if not cpu.supports_floats:
+                    continue
+            if TP == rffi.FLOAT:
+                if not cpu.supports_singlefloats:
+                    continue
+            #
+            result = rffi.cast(TP, result)
+            #
+            def pseudo_c_function():
+                return result
+            #
+            FPTR = self.Ptr(self.FuncType([], TP))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests([], ffitype)
+            faildescr = BasicFailDescr(1)
+            # pick the box class that receives the result from the kind
+            # code of the ffi type
+            kind = types.getkind(ffitype)
+            if kind in 'uis':
+                b3 = BoxInt()
+            elif kind in 'fUI':
+                b3 = BoxFloat()
+            else:
+                assert 0, kind
+            #
+            ops = [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox], b3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [b3], None, descr=BasicFinalDescr(0))
+                ]
+            ops[1].setfailargs([])
+            looptoken = JitCellToken()
+            self.cpu.compile_loop([], ops, looptoken)
+
+            deadframe = self.cpu.execute_token(looptoken)
+            fail = self.cpu.get_latest_descr(deadframe)
+            # identifier 0 is the FINISH descr, 1 would be the guard failing
+            assert fail.identifier == 0
+            if isinstance(b3, BoxInt):
+                r = self.cpu.get_int_value(deadframe, 0)
+                if isinstance(result, r_singlefloat):
+                    # the singlefloat comes back inside an integer:
+                    # reinterpret its low 32 bits as a C float
+                    assert -sys.maxint-1 <= r <= 0xFFFFFFFF
+                    r, = struct.unpack("f", struct.pack("I", r & 0xFFFFFFFF))
+                    result = float(result)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+            elif isinstance(b3, BoxFloat):
+                r = self.cpu.get_float_value(deadframe, 0)
+                if isinstance(result, float):
+                    r = longlong.getrealfloat(r)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+
+    def test_call_release_gil_variable_function_and_arguments(self):
+        """Randomized test of CALL_RELEASE_GIL argument passing.
+
+        Runs 100 rounds.  Each round builds a function signature of 4 to 19
+        arguments drawn from a small random set of types, compiles a loop
+        whose CALL_RELEASE_GIL takes both the function address and all the
+        arguments from input boxes (some of them copied through SAME_AS
+        first, some kept alive after the call), executes it and checks that
+        the called function received exactly the expected values.
+
+        All random choices come from the single seeded generator 'rnd', so
+        that a failing round can be reproduced.
+        """
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+
+        cpu = self.cpu
+        rnd = random.Random(525)
+
+        ALL_TYPES = [
+            (types.ulong,  lltype.Unsigned),
+            (types.slong,  lltype.Signed),
+            (types.uint8,  rffi.UCHAR),
+            (types.sint8,  rffi.SIGNEDCHAR),
+            (types.uint16, rffi.USHORT),
+            (types.sint16, rffi.SHORT),
+            (types.uint32, rffi.UINT),
+            (types.sint32, rffi.INT),
+            ]
+        # the entries below are repeated to give them more weight in
+        # rnd.choice()
+        if sys.maxint < 2**32 and cpu.supports_longlong:
+            ALL_TYPES += [
+                (types.uint64, lltype.UnsignedLongLong),
+                (types.sint64, lltype.SignedLongLong),
+                ] * 2
+        if cpu.supports_floats:
+            ALL_TYPES += [
+                (types.double, rffi.DOUBLE),
+                ] * 4
+        if cpu.supports_singlefloats:
+            ALL_TYPES += [
+                (types.float,  rffi.FLOAT),
+                ] * 4
+
+        for k in range(100):
+            # use the seeded 'rnd' here too, not the global 'random' module,
+            # otherwise the sequence of tests differs from run to run
+            POSSIBLE_TYPES = [rnd.choice(ALL_TYPES)
+                              for i in range(rnd.randrange(2, 5))]
+            load_factor = rnd.random()
+            keepalive_factor = rnd.random()
+            #
+            # 'seen' is looked up when the function is called; it is
+            # assigned further down, just before executing the loop
+            def pseudo_c_function(*args):
+                seen.append(list(args))
+            #
+            ffitypes = []
+            ARGTYPES = []
+            for i in range(rnd.randrange(4, 20)):
+                ffitype, TP = rnd.choice(POSSIBLE_TYPES)
+                ffitypes.append(ffitype)
+                ARGTYPES.append(TP)
+            #
+            FPTR = self.Ptr(self.FuncType(ARGTYPES, lltype.Void))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests(ffitypes, types.void)
+            faildescr = BasicFailDescr(1)
+            #
+            argboxes = [BoxInt()]   # for the function to call
+            codes = ['X']
+            for ffitype in ffitypes:
+                kind = types.getkind(ffitype)
+                codes.append(kind)
+                if kind in 'uis':
+                    b1 = BoxInt()
+                elif kind in 'fUI':
+                    b1 = BoxFloat()
+                else:
+                    assert 0, kind
+                argboxes.append(b1)
+            codes = ''.join(codes)     # useful for pdb
+            print
+            print codes
+            #
+            # the typed values that the called function is expected to see
+            argvalues = [funcbox.getint()]
+            for TP in ARGTYPES:
+                r = (rnd.random() - 0.5) * 999999999999.9
+                r = rffi.cast(TP, r)
+                argvalues.append(r)
+            #
+            # the same values, in the form passed to execute_token()
+            argvalues_normal = argvalues[:1]
+            for ffitype, r in zip(ffitypes, argvalues[1:]):
+                kind = types.getkind(ffitype)
+                if kind in 'ui':
+                    r = rffi.cast(lltype.Signed, r)
+                elif kind in 's':
+                    # reinterpret the bits of the C float as an integer
+                    r, = struct.unpack("i", struct.pack("f", float(r)))
+                elif kind in 'f':
+                    r = longlong.getfloatstorage(r)
+                elif kind in 'UI':   # 32-bit only
+                    r = rffi.cast(lltype.SignedLongLong, r)
+                else:
+                    assert 0
+                argvalues_normal.append(r)
+            #
+            # with probability 'load_factor', pass a SAME_AS copy of the
+            # input box to the call instead of the input box itself; the
+            # SAME_AS operations are inserted in random order
+            ops = []
+            loadcodes = []
+            insideboxes = []
+            for b1 in argboxes:
+                load = rnd.random() < load_factor
+                loadcodes.append(' ^'[load])     # '^' marks a copied box
+                if load:
+                    b2 = b1.clonebox()
+                    ops.insert(rnd.randrange(0, len(ops)+1),
+                               ResOperation(rop.SAME_AS, [b1], b2))
+                    b1 = b2
+                insideboxes.append(b1)
+            loadcodes = ''.join(loadcodes)
+            print loadcodes
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, insideboxes, None,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [], None, descr=BasicFinalDescr(0))
+                ]
+            ops[-2].setfailargs([])
+            # keep alive a random subset of the insideboxes
+            for b1 in insideboxes:
+                if rnd.random() < keepalive_factor:
+                    ops.insert(-1, ResOperation(rop.SAME_AS, [b1],
+                                                b1.clonebox()))
+            looptoken = JitCellToken()
+            self.cpu.compile_loop(argboxes, ops, looptoken)
+            #
+            seen = []
+            deadframe = self.cpu.execute_token(looptoken, *argvalues_normal)
+            fail = self.cpu.get_latest_descr(deadframe)
+            assert fail.identifier == 0
+            expected = argvalues[1:]
+            [got] = seen        # the function must be called exactly once
+            different_values = ['%r != %r' % (a, b)
+                                    for a, b in zip(got, expected)
+                                        if a != b]
+            assert got == expected, ', '.join(different_values)
+
+
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()

rpython/jit/backend/x86/arch.py

     PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
 
-assert PASS_ON_MY_FRAME >= 11       # asmgcc needs at least JIT_USE_WORDS + 2
+assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3

rpython/jit/backend/x86/assembler.py

                                                 DEBUG_COUNTER, debug_bridge)
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
-from rpython.jit.metainterp.history import Const, Box
+from rpython.jit.metainterp.history import Const, Box, VOID
 from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
     RegLoc, FrameLoc, ConstFloatLoc, ImmedLoc, AddressLoc, imm,
     imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
 from rpython.rlib.objectmodel import we_are_translated
-from rpython.jit.backend.x86 import rx86, codebuf
+from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
 from rpython.jit.metainterp.resoperation import rop
 from rpython.jit.backend.x86 import support
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.rlib import rgc
-from rpython.rlib.clibffi import FFI_DEFAULT_ABI
-from rpython.jit.backend.x86.jump import remap_frame_layout
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.codewriter import longlong
 from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.rlib.objectmodel import compute_unique_id
 
 
-# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
-# better safe than sorry
-CALL_ALIGN = 16 // WORD
-
-
-def align_stack_words(words):
-    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
-
-
 class Assembler386(BaseAssembler):
     _regalloc = None
     _output_loop_log = None
             mc.MOV_rs(esi.value, WORD*2)
             # push first arg
             mc.MOV_rr(edi.value, ebp.value)
-            align = align_stack_words(1)
+            align = callbuilder.align_stack_words(1)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
         else:
-            align = align_stack_words(3)
+            align = callbuilder.align_stack_words(3)
             mc.MOV_rs(eax.value, WORD * 2)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
             mc.MOV_sr(WORD, eax.value)
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         return bool(gcrootmap) and not gcrootmap.is_shadow_stack
 
-    def _emit_call(self, x, arglocs, start=0, tmp=eax,
-                   argtypes=None, callconv=FFI_DEFAULT_ABI,
-                   # whether to worry about a CALL that can collect; this
-                   # is always true except in call_release_gil
-                   can_collect=True,
-                   # max number of arguments we can pass on esp; if more,
-                   # we need to decrease esp temporarily
-                   stack_max=PASS_ON_MY_FRAME):
-        #
-        if IS_X86_64:
-            return self._emit_call_64(x, arglocs, start, argtypes,
-                                      can_collect, stack_max)
-        stack_depth = 0
-        n = len(arglocs)
-        for i in range(start, n):
-            loc = arglocs[i]
-            stack_depth += loc.get_width() // WORD
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            self.mc.SUB_ri(esp.value, align * WORD)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
+    def simple_call(self, fnloc, arglocs, result_loc=eax):
+        if result_loc is xmm0:
+            result_type = FLOAT
+            result_size = 8
+        elif result_loc is None:
+            result_type = VOID
+            result_size = 0
         else:
-            align = 0
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if isinstance(loc, RegLoc):
-                if loc.is_xmm:
-                    self.mc.MOVSD_sx(p, loc.value)
-                else:
-                    self.mc.MOV_sr(p, loc.value)
-            p += loc.get_width()
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if not isinstance(loc, RegLoc):
-                if loc.get_width() == 8:
-                    self.mc.MOVSD(xmm0, loc)
-                    self.mc.MOVSD_sx(p, xmm0.value)
-                else:
-                    self.mc.MOV(tmp, loc)
-                    self.mc.MOV_sr(p, tmp.value)
-            p += loc.get_width()
-        # x is a location
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if callconv != FFI_DEFAULT_ABI:
-            self._fix_stdcall(callconv, p - align * WORD)
-        elif align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+            result_type = INT
+            result_size = WORD
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs,
+                                     result_loc, result_type,
+                                     result_size)
+        cb.emit()
 
-    def _fix_stdcall(self, callconv, p):
-        from rpython.rlib.clibffi import FFI_STDCALL
-        assert callconv == FFI_STDCALL
-        # it's a bit stupid, but we're just going to cancel the fact that
-        # the called function just added 'p' to ESP, by subtracting it again.
-        self.mc.SUB_ri(esp.value, p)
-
-    def _emit_call_64(self, x, arglocs, start, argtypes,
-                      can_collect, stack_max):
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        singlefloats = None
-
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-
-        on_stack = 0
-        # count the stack depth
-        floats = 0
-        for i in range(start, len(arglocs)):
-            arg = arglocs[i]
-            if arg.is_float() or argtypes and argtypes[i - start] == 'S':
-                floats += 1
-        all_args = len(arglocs) - start
-        stack_depth = (max(all_args - floats - len(unused_gpr), 0) +
-                       max(floats - len(unused_xmm), 0))
-        align = 0
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
-            self.mc.SUB_ri(esp.value, align * WORD)
-        for i in range(start, len(arglocs)):
-            loc = arglocs[i]
-            if loc.is_float():
-                xmm_src_locs.append(loc)
-                if len(unused_xmm) > 0:
-                    xmm_dst_locs.append(unused_xmm.pop())
-                else:
-                    xmm_dst_locs.append(RawEspLoc(on_stack * WORD, FLOAT))
-                    on_stack += 1
-            elif argtypes is not None and argtypes[i-start] == 'S':
-                # Singlefloat argument
-                if singlefloats is None:
-                    singlefloats = []
-                if len(unused_xmm) > 0:
-                    singlefloats.append((loc, unused_xmm.pop()))
-                else:
-                    singlefloats.append((loc, RawEspLoc(on_stack * WORD, INT)))
-                    on_stack += 1
-            else:
-                src_locs.append(loc)
-                if len(unused_gpr) > 0:
-                    dst_locs.append(unused_gpr.pop())
-                else:
-                    dst_locs.append(RawEspLoc(on_stack * WORD, INT))
-                    on_stack += 1
-
-        # Handle register arguments: first remap the xmm arguments
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
-                           X86_64_XMM_SCRATCH_REG)
-        # Load the singlefloat arguments from main regs or stack to xmm regs
-        if singlefloats is not None:
-            for src, dst in singlefloats:
-                if isinstance(dst, RawEspLoc):
-                    # XXX too much special logic
-                    if isinstance(src, RawEbpLoc):
-                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
-                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
-                    else:
-                        self.mc.MOV32(dst, src)
-                    continue
-                if isinstance(src, ImmedLoc):
-                    self.mc.MOV(X86_64_SCRATCH_REG, src)
-                    src = X86_64_SCRATCH_REG
-                self.mc.MOVD(dst, src)
-        # Finally remap the arguments in the main regs
-        # If x is a register and is in dst_locs, then oups, it needs to
-        # be moved away:
-        if x in dst_locs:
-            src_locs.append(x)
-            dst_locs.append(r10)
-            x = r10
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+    def simple_call_no_collect(self, fnloc, arglocs):
+        # Emit a call to 'fnloc' with 'arglocs' through the CallBuilder's
+        # emit_no_collect() entry point, leaving the builder's result
+        # arguments at their defaults.
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs)
+        cb.emit_no_collect()
 
     def _reload_frame_if_necessary(self, mc, align_stack=False):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
             self._write_barrier_fastpath(mc, wbdescr, [ebp], array=False,
                                          is_frame=True, align_stack=align_stack)
 
-    def call(self, addr, args, res):
-        self._emit_call(imm(addr), args)
-        assert res is eax
-
     genop_int_neg = _unaryop("NEG")
     genop_int_invert = _unaryop("NOT")
     genop_int_add = _binaryop_or_lea("ADD", True)
     # ----------
 
     def genop_call_malloc_gc(self, op, arglocs, result_loc):
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self.propagate_memoryerror_if_eax_is_null()
 
     def propagate_memoryerror_if_eax_is_null(self):
         self.pending_guard_tokens.append(guard_token)
 
     def genop_call(self, op, arglocs, resloc):
-        return self._genop_call(op, arglocs, resloc)
+        self._genop_call(op, arglocs, resloc)
 
     def _genop_call(self, op, arglocs, resloc, is_call_release_gil=False):
         from rpython.jit.backend.llsupport.descr import CallDescr
 
-        sizeloc = arglocs[0]
-        assert isinstance(sizeloc, ImmedLoc)
-        size = sizeloc.value
-        signloc = arglocs[1]
-
-        x = arglocs[2]     # the function address
-        if x is eax:
-            tmp = ecx
-        else:
-            tmp = eax
+        cb = callbuilder.CallBuilder(self, arglocs[2], arglocs[3:], resloc)
 
         descr = op.getdescr()
         assert isinstance(descr, CallDescr)
+        cb.callconv = descr.get_call_conv()
+        cb.argtypes = descr.get_arg_types()
+        cb.restype  = descr.get_result_type()
+        sizeloc = arglocs[0]
+        assert isinstance(sizeloc, ImmedLoc)
+        cb.ressize = sizeloc.value
+        signloc = arglocs[1]
+        assert isinstance(signloc, ImmedLoc)
+        cb.ressign = signloc.value
 
-        stack_max = PASS_ON_MY_FRAME
         if is_call_release_gil:
-            if self._is_asmgcc():
-                from rpython.memory.gctransform import asmgcroot
-                stack_max -= asmgcroot.JIT_USE_WORDS
-            can_collect = False
+            cb.emit_call_release_gil()
         else:
-            can_collect = True
-
-        self._emit_call(x, arglocs, 3, tmp=tmp,
-                        argtypes=descr.get_arg_types(),
-                        callconv=descr.get_call_conv(),
-                        can_collect=can_collect,
-                        stack_max=stack_max)
-
-        if IS_X86_32 and isinstance(resloc, FrameLoc) and resloc.type == FLOAT:
-            # a float or a long long return
-            if descr.get_result_type() == 'L':
-                self.mc.MOV_br(resloc.value, eax.value)      # long long
-                self.mc.MOV_br(resloc.value + 4, edx.value)
-                # XXX should ideally not move the result on the stack,
-                #     but it's a mess to load eax/edx into a xmm register
-                #     and this way is simpler also because the result loc
-                #     can just be always a stack location
-            else:
-                self.mc.FSTPL_b(resloc.value)   # float return
-        elif descr.get_result_type() == 'S':
-            # singlefloat return
-            assert resloc is eax
-            if IS_X86_32:
-                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
-                # mess mess mess
-                self.mc.SUB_ri(esp.value, 4)
-                self.mc.FSTPS_s(0)
-                self.mc.POP_r(eax.value)
-            elif IS_X86_64:
-                # must copy from the lower 32 bits of XMM0 into eax
-                self.mc.MOVD_rx(eax.value, xmm0.value)
-        elif size == WORD:
-            assert resloc is eax or resloc is xmm0    # a full word
-        elif size == 0:
-            pass    # void return
-        else:
-            # use the code in load_from_mem to do the zero- or sign-extension
-            assert resloc is eax
-            if size == 1:
-                srcloc = eax.lowest8bits()
-            else:
-                srcloc = eax
-            self.load_from_mem(eax, srcloc, sizeloc, signloc)
+            cb.emit()
 
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
     def genop_guard_call_may_force(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
         self._store_force_index(guard_op)
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self._emit_guard_not_forced(guard_token)
 
     def genop_guard_call_release_gil(self, op, guard_op, guard_token,
                                      arglocs, result_loc):
         self._store_force_index(guard_op)
-        # first, close the stack in the sense of the asmgcc GC root tracker
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap:
-            # we put the gcmap now into the frame before releasing the GIL,
-            # and pop it below after reacquiring the GIL.  The assumption
-            # is that this gcmap describes correctly the situation at any
-            # point in-between: all values containing GC pointers should
-            # be safely saved out of registers by now, and will not be
-            # manipulated by any of the following CALLs.
-            gcmap = self._regalloc.get_gcmap(noregs=True)
-            self.push_gcmap(self.mc, gcmap, store=True)
-            self.call_release_gil(gcrootmap, arglocs)
-        # do the call
         self._genop_call(op, arglocs, result_loc, is_call_release_gil=True)
-        # then reopen the stack
-        if gcrootmap:
-            self.call_reacquire_gil(gcrootmap, result_loc)
-            self.pop_gcmap(self.mc)     # remove the gcmap saved above
-        # finally, the guard_not_forced
         self._emit_guard_not_forced(guard_token)
 
-    def call_release_gil(self, gcrootmap, save_registers):
-        if gcrootmap.is_shadow_stack:
-            args = []
-        else:
-            from rpython.memory.gctransform import asmgcroot
-            # build a 'css' structure on the stack: 2 words for the linkage,
-            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
-            # total size of JIT_USE_WORDS.  This structure is found at
-            # [ESP+css].
-            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
-            assert css >= 2
-            # Save ebp
-            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
-            # Save the "return address": we pretend that it's css
-            if IS_X86_32:
-                reg = eax
-            elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
-            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
-            # Set up jf_extra_stack_depth to pretend that the return address
-            # was at css, and so our stack frame is supposedly shorter by
-            # (css+WORD) bytes
-            self.set_extra_stack_depth(self.mc, -css-WORD)
-            # Call the closestack() function (also releasing the GIL)
-            args = [reg]
-        #
-        self._emit_call(imm(self.releasegil_addr), args, can_collect=False)
-
     def call_reacquire_gil(self, gcrootmap, save_loc):
         # save the previous result (eax/xmm0) into the stack temporarily.
         # XXX like with call_release_gil(), we assume that we don't need
         self.call_assembler(op, guard_op, argloc, vloc, result_loc, eax)
         self._emit_guard_not_forced(guard_token)
 
-    def _call_assembler_emit_call(self, addr, argloc, tmploc):
-        self._emit_call(addr, [argloc], 0, tmp=tmploc)
+    def _call_assembler_emit_call(self, addr, argloc, _):
+        self.simple_call(addr, [argloc])
 
-    def _call_assembler_emit_helper_call(self, addr, arglocs, _):
-         self._emit_call(addr, arglocs, 0, tmp=self._second_tmp_reg)
+    def _call_assembler_emit_helper_call(self, addr, arglocs, result_loc):
+        self.simple_call(addr, arglocs, result_loc)
 
     def _call_assembler_check_descr(self, value, tmploc):
         ofs = self.cpu.get_ofs_of_frame_field('jf_descr')

rpython/jit/backend/x86/callbuilder.py

+from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.metainterp.history import INT, FLOAT
+from rpython.jit.backend.x86.arch import (WORD, IS_X86_64, IS_X86_32,
+                                          PASS_ON_MY_FRAME)
+from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
+    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
+    r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
+    RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
+from rpython.jit.backend.x86.jump import remap_frame_layout
+
+
+# darwin requires the stack to be 16 bytes aligned on calls.
+# Same for gcc 4.5.0, better safe than sorry
+CALL_ALIGN = 16 // WORD
+
+def align_stack_words(words):
+    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
+
+
+
+class AbstractCallBuilder(object):
+
+    # max number of words we have room in esp; if we need more for
+    # arguments, we need to decrease esp temporarily
+    stack_max = PASS_ON_MY_FRAME
+
+    # this can be set to guide more complex calls: gives the detailed
+    # type of the arguments
+    argtypes = ""
+    ressign = False
+
+    # this is the calling convention (can be FFI_STDCALL on Windows)
+    callconv = FFI_DEFAULT_ABI
+
+    # is it for the main CALL of a call_release_gil?
+    is_call_release_gil = False
+
+    # set by save_result_value()
+    tmpresloc = None
+
+
+    def __init__(self, assembler, fnloc, arglocs,
+                 resloc=eax, restype=INT, ressize=WORD):
+        # Avoid tons of issues with a non-immediate fnloc by sticking it
+        # as an extra argument if needed
+        self.fnloc_is_immediate = isinstance(fnloc, ImmedLoc)
+        if self.fnloc_is_immediate:
+            self.fnloc = fnloc
+            self.arglocs = arglocs
+        else:
+            self.arglocs = arglocs + [fnloc]
+        self.asm = assembler
+        self.mc = assembler.mc
+        self.resloc = resloc
+        self.restype = restype
+        self.ressize = ressize
+        self.current_esp = 0     # 0 or (usually) negative, counted in bytes
+
+    def emit_no_collect(self):
+        """Emit a call that cannot collect."""
+        self.prepare_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.load_result()
+
+    def emit(self):
+        """Emit a regular call; not for CALL_RELEASE_GIL."""
+        self.prepare_arguments()
+        self.push_gcmap()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.pop_gcmap()
+        self.load_result()
+
+    def emit_call_release_gil(self):
+        """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
+        and reacqgil_addr."""
+        self.select_call_release_gil_mode()
+        self.prepare_arguments()
+        self.push_gcmap_for_call_release_gil()
+        self.call_releasegil_addr_and_move_real_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.move_real_result_and_call_reacqgil_addr()
+        self.pop_gcmap()
+        self.load_result()
+
+    def select_call_release_gil_mode(self):
+        """Overridden in CallBuilder64"""
+        self.is_call_release_gil = True
+        if self.asm._is_asmgcc():
+            from rpython.memory.gctransform import asmgcroot
+            self.stack_max = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS
+            assert self.stack_max >= 3
+
+    def emit_raw_call(self):
+        self.mc.CALL(self.fnloc)
+        if self.callconv != FFI_DEFAULT_ABI:
+            self.current_esp += self._fix_stdcall(self.callconv)
+
+    def subtract_esp_aligned(self, count):
+        if count > 0:
+            align = align_stack_words(count)
+            self.current_esp -= align * WORD
+            self.mc.SUB_ri(esp.value, align * WORD)
+
+    def restore_esp(self, target_esp=0):
+        if self.current_esp != target_esp:
+            self.mc.ADD_ri(esp.value, target_esp - self.current_esp)
+            self.current_esp = target_esp
+
+    def load_result(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        if self.ressize == 0:
+            return      # void result
+        # use the code in load_from_mem to do the zero- or sign-extension
+        srcloc = self.tmpresloc
+        if srcloc is None:
+            if self.restype == FLOAT:
+                srcloc = xmm0
+            else:
+                srcloc = eax
+        if self.ressize >= WORD and self.resloc is srcloc:
+            return      # no need for any MOV
+        if self.ressize == 1 and isinstance(srcloc, RegLoc):
+            srcloc = srcloc.lowest8bits()
+        self.asm.load_from_mem(self.resloc, srcloc,
+                               imm(self.ressize), imm(self.ressign))
+
+    def push_gcmap(self):
+        # we push *now* the gcmap, describing the status of GC registers
+        # after the rearrangements done just before, ignoring the return
+        # value eax, if necessary
+        assert not self.is_call_release_gil
+        self.change_extra_stack_depth = (self.current_esp != 0)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, -self.current_esp)
+        noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+        gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+    def push_gcmap_for_call_release_gil(self):
+        assert self.is_call_release_gil
+        # we put the gcmap now into the frame before releasing the GIL,
+        # and pop it after reacquiring the GIL.  The assumption
+        # is that this gcmap describes correctly the situation at any
+        # point in-between: all values containing GC pointers should
+        # be safely saved out of registers by now, and will not be
+        # manipulated by any of the following CALLs.
+        gcmap = self.asm._regalloc.get_gcmap(noregs=True)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+    def pop_gcmap(self):
+        self.asm._reload_frame_if_necessary(self.mc)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, 0)
+        self.asm.pop_gcmap(self.mc)
+
+    def call_releasegil_addr_and_move_real_arguments(self):
+        initial_esp = self.current_esp
+        self.save_register_arguments()
+        #
+        if not self.asm._is_asmgcc():
+            # the helper takes no argument
+            self.change_extra_stack_depth = False
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            # build a 'css' structure on the stack: 2 words for the linkage,
+            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
+            # total size of JIT_USE_WORDS.  This structure is found at
+            # [ESP+css].
+            css = -self.current_esp + (
+                WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS))
+            assert css >= 2 * WORD
+            # Save ebp
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
+            # Save the "return address": we pretend that it's css
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
+            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
+            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
+            # Set up jf_extra_stack_depth to pretend that the return address
+            # was at css, and so our stack frame is supposedly shorter by
+            # (PASS_ON_MY_FRAME-JIT_USE_WORDS+1) words
+            delta = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS + 1
+            self.change_extra_stack_depth = True
+            self.asm.set_extra_stack_depth(self.mc, -delta * WORD)
+            # Call the closestack() function (also releasing the GIL)
+            # with 'reg' as argument
+            if IS_X86_32:
+                self.subtract_esp_aligned(1)
+                self.mc.MOV_sr(0, reg.value)
+            #else:
+            #   on x86_64, reg is edi so that it is already correct
+        #
+        self.mc.CALL(imm(self.asm.releasegil_addr))
+        #
+        if not we_are_translated():        # for testing: we should not access
+            self.mc.ADD(ebp, imm(1))       # ebp any more
+        #
+        self.restore_register_arguments()
+        self.restore_esp(initial_esp)
+
+    def save_register_arguments(self):
+        """Overridden in CallBuilder64"""
+
+    def restore_register_arguments(self):
+        """Overridden in CallBuilder64"""
+
+    def move_real_result_and_call_reacqgil_addr(self):
+        # save the result we just got (in eax/eax+edx/st(0)/xmm0)
+        self.save_result_value()
+        # call the reopenstack() function (also reacquiring the GIL)
+        if not self.asm._is_asmgcc():
+            css = 0     # the helper takes no argument
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)
+            if IS_X86_32:
+                self.mc.MOV_sr(0, reg.value)
+        #
+        self.mc.CALL(imm(self.asm.reacqgil_addr))
+        #
+        if not we_are_translated():        # for testing: now we can access
+            self.mc.SUB(ebp, imm(1))       # ebp again
+        #
+        # Now that we reacquired the GIL, we can reload a possibly modified ebp
+        if self.asm._is_asmgcc():
+            # special-case: reload ebp from the css
+            from rpython.memory.gctransform import asmgcroot
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_rs(ebp.value, index_of_ebp)  # MOV EBP, [css.ebp]
+        #else:
+        #   for shadowstack, done for us by _reload_frame_if_necessary()
+
+    def save_result_value(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        raise NotImplementedError
+
+
+class CallBuilder32(AbstractCallBuilder):
+
+    def prepare_arguments(self):
+        arglocs = self.arglocs
+        stack_depth = 0
+        n = len(arglocs)
+        for i in range(n):
+            loc = arglocs[i]
+            stack_depth += loc.get_width() // WORD
+        self.subtract_esp_aligned(stack_depth - self.stack_max)
+        #
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if isinstance(loc, RegLoc):
+                if loc.is_xmm:
+                    self.mc.MOVSD_sx(p, loc.value)
+                else:
+                    self.mc.MOV_sr(p, loc.value)
+            p += loc.get_width()
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if not isinstance(loc, RegLoc):
+                if loc.get_width() == 8:
+                    self.mc.MOVSD(xmm0, loc)
+                    self.mc.MOVSD_sx(p, xmm0.value)
+                elif isinstance(loc, ImmedLoc):
+                    self.mc.MOV_si(p, loc.value)
+                else:
+                    self.mc.MOV(eax, loc)
+                    self.mc.MOV_sr(p, eax.value)
+            p += loc.get_width()
+        self.total_stack_used_by_arguments = p
+        #
+        if not self.fnloc_is_immediate:    # the last "argument" pushed above
+            self.fnloc = RawEspLoc(p - WORD, INT)
+
+
+    def _fix_stdcall(self, callconv):
+        from rpython.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        return self.total_stack_used_by_arguments
+
+    def load_result(self):
+        resloc = self.resloc
+        if resloc is not None and resloc.is_float():
+            # a float or a long long return
+            if self.tmpresloc is None:
+                if self.restype == 'L':     # long long
+                    # move eax/edx -> xmm0
+                    self.mc.MOVD_xr(resloc.value^1, edx.value)
+                    self.mc.MOVD_xr(resloc.value,   eax.value)
+                    self.mc.PUNPCKLDQ_xx(resloc.value, resloc.value^1)
+                else:
+                    # float: we have to go via the stack
+                    self.mc.FSTPL_s(0)
+                    self.mc.MOVSD_xs(resloc.value, 0)
+            else:
+                self.mc.MOVSD(resloc, self.tmpresloc)
+            #
+        elif self.restype == 'S':
+            # singlefloat return: must convert ST(0) to a 32-bit singlefloat
+            # and load it into self.resloc.  mess mess mess
+            if self.tmpresloc is None:
+                self.mc.FSTPS_s(0)
+                self.mc.MOV_rs(resloc.value, 0)
+            else:
+                self.mc.MOV(resloc, self.tmpresloc)
+        else:
+            AbstractCallBuilder.load_result(self)
+
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP+4].  We use "+4"
+        # in order to leave the word at [ESP+0] free, in case it's needed
+        if self.ressize == 0:      # void return
+            return
+        if self.resloc.is_float():
+            # a float or a long long return
+            self.tmpresloc = RawEspLoc(4, FLOAT)
+            if self.restype == 'L':
+                self.mc.MOV_sr(4, eax.value)      # long long
+                self.mc.MOV_sr(8, edx.value)
+            else:
+                self.mc.FSTPL_s(4)                # float return
+        else:
+            self.tmpresloc = RawEspLoc(4, INT)
+            if self.restype == 'S':
+                self.mc.FSTPS_s(4)
+            else:
+                assert self.restype == INT
+                assert self.ressize <= WORD
+                self.mc.MOV_sr(4, eax.value)
+
+
+class CallBuilder64(AbstractCallBuilder):
+
+    ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
+    ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
+    DONT_MOVE_GPR = []
+    _ALL_CALLEE_SAVE_GPR = [ebx, r12, r13, r14, r15]
+
+    next_arg_gpr = 0
+    next_arg_xmm = 0
+
+    def _unused_gpr(self, hint):
+        i = self.next_arg_gpr
+        self.next_arg_gpr = i + 1
+        try:
+            res = self.ARGUMENTS_GPR[i]
+        except IndexError:
+            return None
+        if hint in self.DONT_MOVE_GPR:
+            self.ARGUMENTS_GPR[i] = hint
+            res = hint
+        return res
+
+    def _unused_xmm(self):
+        i = self.next_arg_xmm
+        self.next_arg_xmm = i + 1
+        try:
+            return self.ARGUMENTS_XMM[i]
+        except IndexError:
+            return None
+
+    def _permute_to_prefer_unused_registers(self, lst):
+        # permute 'lst' so that it starts with registers that are not
+        # in 'self.already_used', and ends with registers that are.
+        N = len(lst)
+        i = 0
+        while i < N:
+            reg = lst[i]
+            if reg in self.already_used:
+                # move this reg to the end, and decrement N
+                N -= 1
+                assert N >= i
+                lst[N], lst[i] = lst[i], lst[N]
+            else:
+                i += 1
+
+    def select_call_release_gil_mode(self):
+        AbstractCallBuilder.select_call_release_gil_mode(self)
+        # We have to copy the arguments around a bit more in this mode,
+        # but on the other hand we don't need prepare_arguments() moving
+        # them in precisely the final registers.  Here we look around for
+        # unused registers that may be more likely usable.
+        from rpython.jit.backend.x86.regalloc import X86_64_RegisterManager
+        from rpython.jit.backend.x86.regalloc import X86_64_XMMRegisterManager
+        self.already_used = {}
+        for loc in self.arglocs:
+            self.already_used[loc] = None
+        #
+        lst = X86_64_RegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        # <optimization>
+        extra = []
+        for reg in self.asm._regalloc.rm.free_regs:
+            if (reg not in self.already_used and
+                    reg in self._ALL_CALLEE_SAVE_GPR):
+                extra.append(reg)
+        self.free_callee_save_gprs = extra
+        lst = extra + lst
+        # </optimization>
+        self.ARGUMENTS_GPR = lst[:len(self.ARGUMENTS_GPR)]
+        self.DONT_MOVE_GPR = self._ALL_CALLEE_SAVE_GPR
+        #
+        lst = X86_64_XMMRegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        self.ARGUMENTS_XMM = lst[:len(self.ARGUMENTS_XMM)]
+
+    def prepare_arguments(self):
+        src_locs = []
+        dst_locs = []
+        xmm_src_locs = []
+        xmm_dst_locs = []
+        singlefloats = None
+
+        arglocs = self.arglocs
+        argtypes = self.argtypes
+
+        on_stack = 0
+        for i in range(len(arglocs)):
+            loc = arglocs[i]
+            if loc.is_float():
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, FLOAT)
+                    on_stack += 1
+                xmm_src_locs.append(loc)
+                xmm_dst_locs.append(tgt)
+            elif i < len(argtypes) and argtypes[i] == 'S':
+                # Singlefloat argument
+                if singlefloats is None:
+                    singlefloats = []
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                singlefloats.append((loc, tgt))
+            else:
+                tgt = self._unused_gpr(hint=loc)
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                src_locs.append(loc)
+                dst_locs.append(tgt)
+
+        if not self.fnloc_is_immediate:
+            self.fnloc = dst_locs[-1]     # the last "argument" prepared above
+
+        if not we_are_translated():  # assert that we got the right stack depth
+            floats = 0
+            for i in range(len(arglocs)):
+                arg = arglocs[i]
+                if arg.is_float() or (i < len(argtypes) and argtypes[i]=='S'):
+                    floats += 1
+            all_args = len(arglocs)
+            stack_depth = (max(all_args - floats - len(self.ARGUMENTS_GPR), 0)
+                           + max(floats - len(self.ARGUMENTS_XMM), 0))
+            assert stack_depth == on_stack
+
+        self.subtract_esp_aligned(on_stack - self.stack_max)
+
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self.asm, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                if isinstance(dst, RawEspLoc):
+                    # XXX too much special logic
+                    if isinstance(src, RawEbpLoc):
+                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
+                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
+                    else:
+                        self.mc.MOV32(dst, src)
+                    continue
+                if isinstance(src, ImmedLoc):
+                    self.mc.MOV(X86_64_SCRATCH_REG, src)
+                    src = X86_64_SCRATCH_REG
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
+        remap_frame_layout(self.asm, src_locs, dst_locs, X86_64_SCRATCH_REG)
+
+
+    def _fix_stdcall(self, callconv):
+        assert 0     # should not occur on 64-bit
+
+    def load_result(self):
+        if self.restype == 'S' and self.tmpresloc is None:
+            # singlefloat return: use MOVD to load the target register
+            # from the lower 32 bits of XMM0
+            self.mc.MOVD(self.resloc, xmm0)
+        else:
+            AbstractCallBuilder.load_result(self)
+
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP].
+        if self.ressize == 0:      # void return
+            return
+        #
+        if self.restype == FLOAT:    # and not 'S'
+            self.mc.MOVSD_sx(0, xmm0.value)
+            self.tmpresloc = RawEspLoc(0, FLOAT)
+            return
+        #
+        if len(self.free_callee_save_gprs) == 0:
+            self.tmpresloc = RawEspLoc(0, INT)
+        else:
+            self.tmpresloc = self.free_callee_save_gprs[0]
+        #
+        if self.restype == 'S':
+            # singlefloat return: use MOVD to store the lower 32 bits
+            # of XMM0 into the tmpresloc (register or [ESP])
+            self.mc.MOVD(self.tmpresloc, xmm0)
+        else:
+            assert self.restype == INT
+            self.mc.MOV(self.tmpresloc, eax)
+
+    def save_register_arguments(self):
+        # Save the argument registers, which are given by self.ARGUMENTS_xxx.
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        n_saved_regs = n_gpr + n_xmm
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] in self._ALL_CALLEE_SAVE_GPR:
+                n_saved_regs -= 1     # don't need to save it
+        self.subtract_esp_aligned(n_saved_regs)
+        #
+        n = 0
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_sr(n * WORD, self.ARGUMENTS_GPR[i].value)
+                n += 1
+        for i in range(n_xmm):
+            self.mc.MOVSD_sx(n * WORD, self.ARGUMENTS_XMM[i].value)
+            n += 1
+        assert n == n_saved_regs
+        self.n_saved_regs = n_saved_regs
+
+    def restore_register_arguments(self):
+        # Restore the saved values into the *real* registers used for calls
+        # --- which are not self.ARGUMENTS_xxx!
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        #
+        n = 0
+        for i in range(n_gpr):
+            tgtvalue = CallBuilder64.ARGUMENTS_GPR[i].value
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_rs(tgtvalue, n * WORD)
+                n += 1
+            else:
+                self.mc.MOV_rr(tgtvalue, self.ARGUMENTS_GPR[i].value)
+        for i in range(n_xmm):
+            self.mc.MOVSD_xs(CallBuilder64.ARGUMENTS_XMM[i].value, n * WORD)
+            n += 1
+        assert n == self.n_saved_regs
+        #
+        if isinstance(self.fnloc, RegLoc):    # fix this register
+            self.fnloc = CallBuilder64.ARGUMENTS_GPR[n_gpr - 1]
+
+
+if IS_X86_32:
+    CallBuilder = CallBuilder32
+if IS_X86_64:
+    CallBuilder = CallBuilder64

rpython/jit/backend/x86/regalloc.py

         rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = y
         return ConstFloatLoc(adr)
 
-    def after_call(self, v):
-        # the result is stored in st0, but we don't have this around,
-        # so genop_call will move it to some frame location immediately
-        # after the call
-        return self.frame_manager.loc(v)
+    def call_result_location(self, v):
+        return xmm0
 
 class X86_64_XMMRegisterManager(X86XMMRegisterManager):
     # xmm15 reserved for scratch use
     all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14]
     save_around_call_regs = all_regs
 
-    def call_result_location(self, v):
-        return xmm0
-
-    def after_call(self, v):
-        # We use RegisterManager's implementation, since X86XMMRegisterManager
-        # places the result on the stack, which we don't need to do when the
-        # calling convention places the result in xmm0
-        return RegisterManager.after_call(self, v)
-
 class X86FrameManager(FrameManager):
     def __init__(self, base_ofs):
         FrameManager.__init__(self)
         self._consider_call(op, guard_op)
 
     def consider_call_release_gil(self, op, guard_op):
-        # We spill the arguments to the stack, because we need to do 3 calls:
-        # call_release_gil(), the_real_c_function(), and call_reacquire_gil().
-        # The arguments are used on the second call only.  XXX we assume
-        # that the XMM arguments won't be modified by call_release_gil().
-        for i in range(op.numargs()):
-            loc = self.loc(op.getarg(i))
-            if loc in self.rm.save_around_call_regs:
-                self.rm.force_spill_var(op.getarg(i))
         assert guard_op is not None
         self._consider_call(op, guard_op)
 
         # call memcpy()
         self.rm.before_call()
         self.xrm.before_call()
-        self.assembler._emit_call(imm(self.assembler.memcpy_addr),
-                                  [dstaddr_loc, srcaddr_loc, length_loc],
-                                  can_collect=False)
+        self.assembler.simple_call_no_collect(imm(self.assembler.memcpy_addr),
+                                        [dstaddr_loc, srcaddr_loc, length_loc])
         self.rm.possibly_free_var(length_box)
         self.rm.possibly_free_var(dstaddr_box)
         self.rm.possibly_free_var(srcaddr_box)

rpython/jit/backend/x86/rx86.py

     CALL_l = insn('\xE8', relative(1))
     CALL_r = insn(rex_nw, '\xFF', register(1), chr(0xC0 | (2<<3)))
     CALL_b = insn('\xFF', orbyte(2<<3), stack_bp(1))
+    CALL_s = insn('\xFF', orbyte(2<<3), stack_sp(1))
 
     # XXX: Only here for testing purposes..."as" happens to encode the
     # registers in the opposite order that we would otherwise do in a
 
     # x87 instructions
     FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPL_s = insn('\xDD', orbyte(3<<3), stack_sp(1)) # rffi.DOUBLE ('as' wants L??)
     FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.