
Merge remote-tracking branch 'remotes/rth-gitlab/tags/pull-tcg-20210619-2' into staging

TCI cleanup and re-encoding
Fixes for #367 and #390.
Move TCGCond to tcg/tcg-cond.h.
Fix for win32 qemu_try_memalign.

# gpg: Signature made Sun 20 Jun 2021 05:23:53 BST
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth-gitlab/tags/pull-tcg-20210619-2: (33 commits)
  util/oslib-win32: Fix fatal assertion in qemu_try_memalign
  tcg: expose TCGCond manipulation routines
  tcg: Restart when exhausting the stack frame
  tcg: Allocate sufficient storage in temp_allocate_frame
  tcg/sparc: Fix temp_allocate_frame vs sparc stack bias
  accel/tcg: Probe the proper permissions for atomic ops
  tests/tcg: Increase timeout for TCI
  tcg/tci: Use {set,clear}_helper_retaddr
  tcg/tci: Remove the qemu_ld/st_type macros
  Revert "tcg/tci: Use exec/cpu_ldst.h interfaces"
  tcg/tci: Split out tci_qemu_ld, tci_qemu_st
  tcg/tci: Implement add2, sub2
  tcg/tci: Implement mulu2, muls2
  tcg/tci: Implement clz, ctz, ctpop
  tcg/tci: Implement extract, sextract
  tcg/tci: Implement andc, orc, eqv, nand, nor
  tcg/tci: Implement movcond
  tcg/tci: Implement goto_ptr
  tcg/tci: Change encoding to uint32_t units
  tcg/tci: Remove tci_write_reg
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Commit b733163e05 by Peter Maydell, 2021-06-22 10:39:16 +01:00
39 changed files with 1453 additions and 1202 deletions

@ -74,7 +74,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW;
DATA_TYPE ret;
uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
ATOMIC_MMU_IDX);
@ -95,7 +95,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP_R;
uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
ATOMIC_MMU_IDX);
@ -110,7 +110,7 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_W;
uint16_t info = trace_mem_build_info(SHIFT, false, 0, true,
ATOMIC_MMU_IDX);
@ -125,7 +125,7 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW;
DATA_TYPE ret;
uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
ATOMIC_MMU_IDX);
@ -142,7 +142,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ABI_TYPE val EXTRA_ARGS) \
{ \
ATOMIC_MMU_DECLS; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW; \
DATA_TYPE ret; \
uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \
ATOMIC_MMU_IDX); \
@ -176,7 +176,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ABI_TYPE xval EXTRA_ARGS) \
{ \
ATOMIC_MMU_DECLS; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW; \
XDATA_TYPE cmp, old, new, val = xval; \
uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \
ATOMIC_MMU_IDX); \
@ -221,7 +221,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW;
DATA_TYPE ret;
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
ATOMIC_MMU_IDX);
@ -242,7 +242,7 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP_R;
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
ATOMIC_MMU_IDX);
@ -257,7 +257,7 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_W;
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, true,
ATOMIC_MMU_IDX);
@ -274,7 +274,7 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
ABI_TYPE val EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW;
ABI_TYPE ret;
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
ATOMIC_MMU_IDX);
@ -291,7 +291,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ABI_TYPE val EXTRA_ARGS) \
{ \
ATOMIC_MMU_DECLS; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW; \
DATA_TYPE ret; \
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \
false, ATOMIC_MMU_IDX); \
@ -323,7 +323,7 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ABI_TYPE xval EXTRA_ARGS) \
{ \
ATOMIC_MMU_DECLS; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP_RW; \
XDATA_TYPE ldo, ldn, old, new, val = xval; \
uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \
false, ATOMIC_MMU_IDX); \


@ -1742,18 +1742,22 @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
#endif
/* Probe for a read-modify-write atomic operation. Do not allow unaligned
* operations, or io operations to proceed. Return the host address. */
/*
* Probe for an atomic operation. Do not allow unaligned operations,
* or io operations to proceed. Return the host address.
*
* @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
*/
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
TCGMemOpIdx oi, int size, int prot,
uintptr_t retaddr)
{
size_t mmu_idx = get_mmuidx(oi);
uintptr_t index = tlb_index(env, mmu_idx, addr);
CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
target_ulong tlb_addr = tlb_addr_write(tlbe);
MemOp mop = get_memop(oi);
int a_bits = get_alignment_bits(mop);
int s_bits = mop & MO_SIZE;
uintptr_t index;
CPUTLBEntry *tlbe;
target_ulong tlb_addr;
void *hostaddr;
/* Adjust the given return address. */
@ -1767,7 +1771,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
}
/* Enforce qemu required alignment. */
if (unlikely(addr & ((1 << s_bits) - 1))) {
if (unlikely(addr & (size - 1))) {
/* We get here if guest alignment was not requested,
or was not enforced by cpu_unaligned_access above.
We might widen the access and emulate, but for now
mark an exception and exit the cpu loop. */
@ -1775,15 +1779,45 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
goto stop_the_world;
}
index = tlb_index(env, mmu_idx, addr);
tlbe = tlb_entry(env, mmu_idx, addr);
/* Check TLB entry and enforce page permissions. */
if (!tlb_hit(tlb_addr, addr)) {
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(env_cpu(env), addr, 1 << s_bits, MMU_DATA_STORE,
mmu_idx, retaddr);
index = tlb_index(env, mmu_idx, addr);
tlbe = tlb_entry(env, mmu_idx, addr);
if (prot & PAGE_WRITE) {
tlb_addr = tlb_addr_write(tlbe);
if (!tlb_hit(tlb_addr, addr)) {
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(env_cpu(env), addr, size,
MMU_DATA_STORE, mmu_idx, retaddr);
index = tlb_index(env, mmu_idx, addr);
tlbe = tlb_entry(env, mmu_idx, addr);
}
tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
}
/* Let the guest notice RMW on a write-only page. */
if ((prot & PAGE_READ) &&
unlikely(tlbe->addr_read != (tlb_addr & ~TLB_NOTDIRTY))) {
tlb_fill(env_cpu(env), addr, size,
MMU_DATA_LOAD, mmu_idx, retaddr);
/*
* Since we don't support reads and writes to different addresses,
* and we do have the proper page loaded for write, this shouldn't
* ever return. But just in case, handle via stop-the-world.
*/
goto stop_the_world;
}
} else /* if (prot & PAGE_READ) */ {
tlb_addr = tlbe->addr_read;
if (!tlb_hit(tlb_addr, addr)) {
if (!VICTIM_TLB_HIT(addr_write, addr)) {
tlb_fill(env_cpu(env), addr, size,
MMU_DATA_LOAD, mmu_idx, retaddr);
index = tlb_index(env, mmu_idx, addr);
tlbe = tlb_entry(env, mmu_idx, addr);
}
tlb_addr = tlbe->addr_read & ~TLB_INVALID_MASK;
}
tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
}
/* Notice an IO access or a needs-MMU-lookup access */
@ -1793,20 +1827,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
goto stop_the_world;
}
/* Let the guest notice RMW on a write-only page. */
if (unlikely(tlbe->addr_read != (tlb_addr & ~TLB_NOTDIRTY))) {
tlb_fill(env_cpu(env), addr, 1 << s_bits, MMU_DATA_LOAD,
mmu_idx, retaddr);
/* Since we don't support reads and writes to different addresses,
and we do have the proper page loaded for write, this shouldn't
ever return. But just in case, handle via stop-the-world. */
goto stop_the_world;
}
hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
notdirty_write(env_cpu(env), addr, 1 << s_bits,
notdirty_write(env_cpu(env), addr, size,
&env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
}
@ -2669,7 +2693,12 @@ void cpu_stq_le_data(CPUArchState *env, target_ulong ptr, uint64_t val)
#define ATOMIC_NAME(X) \
HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
#define ATOMIC_MMU_DECLS
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr)
#define ATOMIC_MMU_LOOKUP_RW \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_READ | PAGE_WRITE, retaddr)
#define ATOMIC_MMU_LOOKUP_R \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_READ, retaddr)
#define ATOMIC_MMU_LOOKUP_W \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_WRITE, retaddr)
#define ATOMIC_MMU_CLEANUP
#define ATOMIC_MMU_IDX get_mmuidx(oi)
@ -2698,10 +2727,18 @@ void cpu_stq_le_data(CPUArchState *env, target_ulong ptr, uint64_t val)
#undef EXTRA_ARGS
#undef ATOMIC_NAME
#undef ATOMIC_MMU_LOOKUP
#undef ATOMIC_MMU_LOOKUP_RW
#undef ATOMIC_MMU_LOOKUP_R
#undef ATOMIC_MMU_LOOKUP_W
#define EXTRA_ARGS , TCGMemOpIdx oi
#define ATOMIC_NAME(X) HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, GETPC())
#define ATOMIC_MMU_LOOKUP_RW \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_READ | PAGE_WRITE, GETPC())
#define ATOMIC_MMU_LOOKUP_R \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_READ, GETPC())
#define ATOMIC_MMU_LOOKUP_W \
atomic_mmu_lookup(env, addr, oi, DATA_SIZE, PAGE_WRITE, GETPC())
#define DATA_SIZE 1
#include "atomic_template.h"


@ -160,9 +160,8 @@ static void gen_empty_mem_helper(void)
tcg_temp_free_ptr(ptr);
}
static inline
void gen_plugin_cb_start(enum plugin_gen_from from,
enum plugin_gen_cb type, unsigned wr)
static void gen_plugin_cb_start(enum plugin_gen_from from,
enum plugin_gen_cb type, unsigned wr)
{
TCGOp *op;
@ -179,7 +178,7 @@ static void gen_wrapped(enum plugin_gen_from from,
tcg_gen_plugin_cb_end();
}
static inline void plugin_gen_empty_callback(enum plugin_gen_from from)
static void plugin_gen_empty_callback(enum plugin_gen_from from)
{
switch (from) {
case PLUGIN_GEN_AFTER_INSN:
@ -385,7 +384,7 @@ static TCGOp *copy_st_ptr(TCGOp **begin_op, TCGOp *op)
}
static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func,
void *func, unsigned tcg_flags, int *cb_idx)
void *func, int *cb_idx)
{
/* copy all ops until the call */
do {
@ -412,7 +411,7 @@ static TCGOp *copy_call(TCGOp **begin_op, TCGOp *op, void *empty_func,
tcg_debug_assert(i < MAX_OPC_PARAM_ARGS);
}
op->args[*cb_idx] = (uintptr_t)func;
op->args[*cb_idx + 1] = tcg_flags;
op->args[*cb_idx + 1] = (*begin_op)->args[*cb_idx + 1];
return op;
}
@ -439,7 +438,7 @@ static TCGOp *append_udata_cb(const struct qemu_plugin_dyn_cb *cb,
/* call */
op = copy_call(&begin_op, op, HELPER(plugin_vcpu_udata_cb),
cb->f.vcpu_udata, cb->tcg_flags, cb_idx);
cb->f.vcpu_udata, cb_idx);
return op;
}
@ -490,7 +489,7 @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,
if (type == PLUGIN_GEN_CB_MEM) {
/* call */
op = copy_call(&begin_op, op, HELPER(plugin_vcpu_mem_cb),
cb->f.vcpu_udata, cb->tcg_flags, cb_idx);
cb->f.vcpu_udata, cb_idx);
}
return op;
@ -513,9 +512,8 @@ static bool op_rw(const TCGOp *op, const struct qemu_plugin_dyn_cb *cb)
return !!(cb->rw & (w + 1));
}
static inline
void inject_cb_type(const GArray *cbs, TCGOp *begin_op, inject_fn inject,
op_ok_fn ok)
static void inject_cb_type(const GArray *cbs, TCGOp *begin_op,
inject_fn inject, op_ok_fn ok)
{
TCGOp *end_op;
TCGOp *op;


@ -1,5 +1,4 @@
#ifdef CONFIG_PLUGIN
/* Note: no TCG flags because those are overwritten later */
DEF_HELPER_2(plugin_vcpu_udata_cb, void, i32, ptr)
DEF_HELPER_4(plugin_vcpu_mem_cb, void, i32, i32, i64, ptr)
DEF_HELPER_FLAGS_2(plugin_vcpu_udata_cb, TCG_CALL_NO_RWG, void, i32, ptr)
DEF_HELPER_FLAGS_4(plugin_vcpu_mem_cb, TCG_CALL_NO_RWG, void, i32, i32, i64, ptr)
#endif


@ -1220,7 +1220,9 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
/* Macro to call the above, with local variables from the use context. */
#define ATOMIC_MMU_DECLS do {} while (0)
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC())
#define ATOMIC_MMU_LOOKUP_RW atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC())
#define ATOMIC_MMU_LOOKUP_R ATOMIC_MMU_LOOKUP_RW
#define ATOMIC_MMU_LOOKUP_W ATOMIC_MMU_LOOKUP_RW
#define ATOMIC_MMU_CLEANUP do { clear_helper_retaddr(); } while (0)
#define ATOMIC_MMU_IDX MMU_USER_IDX
@ -1250,12 +1252,12 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
#undef EXTRA_ARGS
#undef ATOMIC_NAME
#undef ATOMIC_MMU_LOOKUP
#undef ATOMIC_MMU_LOOKUP_RW
#define EXTRA_ARGS , TCGMemOpIdx oi, uintptr_t retaddr
#define ATOMIC_NAME(X) \
HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, DATA_SIZE, retaddr)
#define ATOMIC_MMU_LOOKUP_RW atomic_mmu_lookup(env, addr, DATA_SIZE, retaddr)
#define DATA_SIZE 16
#include "atomic_template.h"

configure

@ -5803,6 +5803,9 @@ fi
if test "$optreset" = "yes" ; then
echo "HAVE_OPTRESET=y" >> $config_host_mak
fi
if test "$tcg" = "enabled" -a "$tcg_interpreter" = "true" ; then
echo "CONFIG_TCG_INTERPRETER=y" >> $config_host_mak
fi
if test "$fdatasync" = "yes" ; then
echo "CONFIG_FDATASYNC=y" >> $config_host_mak
fi


@ -85,32 +85,14 @@
#define dh_retvar_ptr tcgv_ptr_temp(retval)
#define dh_retvar(t) glue(dh_retvar_, dh_alias(t))
#define dh_is_64bit_void 0
#define dh_is_64bit_noreturn 0
#define dh_is_64bit_i32 0
#define dh_is_64bit_i64 1
#define dh_is_64bit_ptr (sizeof(void *) == 8)
#define dh_is_64bit_cptr dh_is_64bit_ptr
#define dh_is_64bit(t) glue(dh_is_64bit_, dh_alias(t))
#define dh_is_signed_void 0
#define dh_is_signed_noreturn 0
#define dh_is_signed_i32 0
#define dh_is_signed_s32 1
#define dh_is_signed_i64 0
#define dh_is_signed_s64 1
#define dh_is_signed_f16 0
#define dh_is_signed_f32 0
#define dh_is_signed_f64 0
#define dh_is_signed_tl 0
#define dh_is_signed_int 1
/* ??? This is highly specific to the host cpu. There are even special
extension instructions that may be required, e.g. ia64's addp4. But
for now we don't support any 64-bit targets with 32-bit pointers. */
#define dh_is_signed_ptr 0
#define dh_is_signed_cptr dh_is_signed_ptr
#define dh_is_signed_env dh_is_signed_ptr
#define dh_is_signed(t) dh_is_signed_##t
#define dh_typecode_void 0
#define dh_typecode_noreturn 0
#define dh_typecode_i32 2
#define dh_typecode_s32 3
#define dh_typecode_i64 4
#define dh_typecode_s64 5
#define dh_typecode_ptr 6
#define dh_typecode(t) glue(dh_typecode_, dh_alias(t))
#define dh_callflag_i32 0
#define dh_callflag_s32 0
@ -126,8 +108,7 @@
#define dh_callflag_noreturn TCG_CALL_NO_RETURN
#define dh_callflag(t) glue(dh_callflag_, dh_alias(t))
#define dh_sizemask(t, n) \
((dh_is_64bit(t) << (n*2)) | (dh_is_signed(t) << (n*2+1)))
#define dh_typemask(t, n) (dh_typecode(t) << (n * 3))
#define dh_arg(t, n) \
glue(glue(tcgv_, dh_alias(t)), _temp)(glue(arg, n))
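For a concrete feel of the new packing, here is a standalone worked
example (editor's sketch; the DH_* names are local stand-ins for the
dh_typecode_*/dh_typemask macros above) that reproduces the typemask of
DEF_HELPER_FLAGS_2(plugin_vcpu_udata_cb, void, i32, ptr) from the
plugin-helpers hunk earlier in this patch:

#include <assert.h>
#include <stdio.h>

#define DH_TYPECODE_VOID 0   /* dh_typecode_void */
#define DH_TYPECODE_I32  2   /* dh_typecode_i32  */
#define DH_TYPECODE_PTR  6   /* dh_typecode_ptr  */

#define DH_TYPEMASK(code, n) ((code) << ((n) * 3))

int main(void)
{
    /* Slot 0 holds the return type; arguments start at slot 1. */
    unsigned typemask = DH_TYPEMASK(DH_TYPECODE_VOID, 0)
                      | DH_TYPEMASK(DH_TYPECODE_I32, 1)
                      | DH_TYPEMASK(DH_TYPECODE_PTR, 2);

    assert(typemask == 0x190);
    printf("typemask = 0x%x\n", typemask);
    return 0;
}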


@ -13,50 +13,50 @@
#define DEF_HELPER_FLAGS_0(NAME, FLAGS, ret) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) },
.typemask = dh_typemask(ret, 0) },
#define DEF_HELPER_FLAGS_1(NAME, FLAGS, ret, t1) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) },
#define DEF_HELPER_FLAGS_2(NAME, FLAGS, ret, t1, t2) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) },
#define DEF_HELPER_FLAGS_3(NAME, FLAGS, ret, t1, t2, t3) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) | dh_sizemask(t3, 3) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) | dh_typemask(t3, 3) },
#define DEF_HELPER_FLAGS_4(NAME, FLAGS, ret, t1, t2, t3, t4) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) },
#define DEF_HELPER_FLAGS_5(NAME, FLAGS, ret, t1, t2, t3, t4, t5) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
| dh_sizemask(t5, 5) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
| dh_typemask(t5, 5) },
#define DEF_HELPER_FLAGS_6(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6) \
{ .func = HELPER(NAME), .name = str(NAME), \
.flags = FLAGS | dh_callflag(ret), \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
| dh_sizemask(t5, 5) | dh_sizemask(t6, 6) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
| dh_typemask(t5, 5) | dh_typemask(t6, 6) },
#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
{ .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
.sizemask = dh_sizemask(ret, 0) | dh_sizemask(t1, 1) \
| dh_sizemask(t2, 2) | dh_sizemask(t3, 3) | dh_sizemask(t4, 4) \
| dh_sizemask(t5, 5) | dh_sizemask(t6, 6) | dh_sizemask(t7, 7) },
.typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
| dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
| dh_typemask(t5, 5) | dh_typemask(t6, 6) | dh_typemask(t7, 7) },
#include "helper.h"
#include "trace/generated-helpers.h"


@ -79,7 +79,6 @@ enum plugin_dyn_cb_subtype {
struct qemu_plugin_dyn_cb {
union qemu_plugin_cb_sig f;
void *userp;
unsigned tcg_flags;
enum plugin_dyn_cb_subtype type;
/* @rw applies to mem callbacks only (both regular and inline) */
enum qemu_plugin_mem_rw rw;

include/tcg/tcg-cond.h (new file)

@ -0,0 +1,101 @@
/*
* Tiny Code Generator for QEMU
*
* Copyright (c) 2008 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef TCG_COND_H
#define TCG_COND_H
/*
* Conditions. Note that these are laid out for easy manipulation by
* the functions below:
* bit 0 is used for inverting;
* bit 1 is signed,
* bit 2 is unsigned,
* bit 3 is used with bit 0 for swapping signed/unsigned.
*/
typedef enum {
/* non-signed */
TCG_COND_NEVER = 0 | 0 | 0 | 0,
TCG_COND_ALWAYS = 0 | 0 | 0 | 1,
TCG_COND_EQ = 8 | 0 | 0 | 0,
TCG_COND_NE = 8 | 0 | 0 | 1,
/* signed */
TCG_COND_LT = 0 | 0 | 2 | 0,
TCG_COND_GE = 0 | 0 | 2 | 1,
TCG_COND_LE = 8 | 0 | 2 | 0,
TCG_COND_GT = 8 | 0 | 2 | 1,
/* unsigned */
TCG_COND_LTU = 0 | 4 | 0 | 0,
TCG_COND_GEU = 0 | 4 | 0 | 1,
TCG_COND_LEU = 8 | 4 | 0 | 0,
TCG_COND_GTU = 8 | 4 | 0 | 1,
} TCGCond;
/* Invert the sense of the comparison. */
static inline TCGCond tcg_invert_cond(TCGCond c)
{
return (TCGCond)(c ^ 1);
}
/* Swap the operands in a comparison. */
static inline TCGCond tcg_swap_cond(TCGCond c)
{
return c & 6 ? (TCGCond)(c ^ 9) : c;
}
/* Create an "unsigned" version of a "signed" comparison. */
static inline TCGCond tcg_unsigned_cond(TCGCond c)
{
return c & 2 ? (TCGCond)(c ^ 6) : c;
}
/* Create a "signed" version of an "unsigned" comparison. */
static inline TCGCond tcg_signed_cond(TCGCond c)
{
return c & 4 ? (TCGCond)(c ^ 6) : c;
}
/* Must a comparison be considered unsigned? */
static inline bool is_unsigned_cond(TCGCond c)
{
return (c & 4) != 0;
}
/*
* Create a "high" version of a double-word comparison.
* This removes equality from a LTE or GTE comparison.
*/
static inline TCGCond tcg_high_cond(TCGCond c)
{
switch (c) {
case TCG_COND_GE:
case TCG_COND_LE:
case TCG_COND_GEU:
case TCG_COND_LEU:
return (TCGCond)(c ^ 8);
default:
return c;
}
}
#endif /* TCG_COND_H */
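A quick usage sketch for the helpers above (editor's example; it
assumes only that tcg/tcg-cond.h is on the include path):

#include <stdbool.h>
#include <assert.h>
#include "tcg/tcg-cond.h"

int main(void)
{
    assert(tcg_invert_cond(TCG_COND_LT) == TCG_COND_GE);  /* flips bit 0 */
    assert(tcg_swap_cond(TCG_COND_LT) == TCG_COND_GT);    /* a<b <=> b>a */
    assert(tcg_unsigned_cond(TCG_COND_LT) == TCG_COND_LTU);
    assert(tcg_signed_cond(TCG_COND_GEU) == TCG_COND_GE);
    assert(!is_unsigned_cond(TCG_COND_LE));
    assert(is_unsigned_cond(TCG_COND_LEU));
    assert(tcg_high_cond(TCG_COND_LE) == TCG_COND_LT);    /* drops equality */
    return 0;
}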


@ -277,8 +277,8 @@ DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
#ifdef TCG_TARGET_INTERPRETER
/* These opcodes are only for use between the tci generator and interpreter. */
DEF(tci_movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
DEF(tci_movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
DEF(tci_movi, 1, 0, 1, TCG_OPF_NOT_PRESENT)
DEF(tci_movl, 1, 0, 1, TCG_OPF_NOT_PRESENT)
#endif
#undef TLADDR_ARGS


@ -33,6 +33,7 @@
#include "tcg/tcg-mo.h"
#include "tcg-target.h"
#include "qemu/int128.h"
#include "tcg/tcg-cond.h"
/* XXX: make safe guess about sizes */
#define MAX_OP_PER_INSTR 266
@ -52,6 +53,7 @@
#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
#define CPU_TEMP_BUF_NLONGS 128
#define TCG_STATIC_FRAME_SIZE (CPU_TEMP_BUF_NLONGS * sizeof(long))
/* Default target word size to pointer size. */
#ifndef TCG_TARGET_REG_BITS
@ -406,75 +408,6 @@ typedef TCGv_ptr TCGv_env;
/* Used to align parameters. See the comment before tcgv_i32_temp. */
#define TCG_CALL_DUMMY_ARG ((TCGArg)0)
/* Conditions. Note that these are laid out for easy manipulation by
the functions below:
bit 0 is used for inverting;
bit 1 is signed,
bit 2 is unsigned,
bit 3 is used with bit 0 for swapping signed/unsigned. */
typedef enum {
/* non-signed */
TCG_COND_NEVER = 0 | 0 | 0 | 0,
TCG_COND_ALWAYS = 0 | 0 | 0 | 1,
TCG_COND_EQ = 8 | 0 | 0 | 0,
TCG_COND_NE = 8 | 0 | 0 | 1,
/* signed */
TCG_COND_LT = 0 | 0 | 2 | 0,
TCG_COND_GE = 0 | 0 | 2 | 1,
TCG_COND_LE = 8 | 0 | 2 | 0,
TCG_COND_GT = 8 | 0 | 2 | 1,
/* unsigned */
TCG_COND_LTU = 0 | 4 | 0 | 0,
TCG_COND_GEU = 0 | 4 | 0 | 1,
TCG_COND_LEU = 8 | 4 | 0 | 0,
TCG_COND_GTU = 8 | 4 | 0 | 1,
} TCGCond;
/* Invert the sense of the comparison. */
static inline TCGCond tcg_invert_cond(TCGCond c)
{
return (TCGCond)(c ^ 1);
}
/* Swap the operands in a comparison. */
static inline TCGCond tcg_swap_cond(TCGCond c)
{
return c & 6 ? (TCGCond)(c ^ 9) : c;
}
/* Create an "unsigned" version of a "signed" comparison. */
static inline TCGCond tcg_unsigned_cond(TCGCond c)
{
return c & 2 ? (TCGCond)(c ^ 6) : c;
}
/* Create a "signed" version of an "unsigned" comparison. */
static inline TCGCond tcg_signed_cond(TCGCond c)
{
return c & 4 ? (TCGCond)(c ^ 6) : c;
}
/* Must a comparison be considered unsigned? */
static inline bool is_unsigned_cond(TCGCond c)
{
return (c & 4) != 0;
}
/* Create a "high" version of a double-word comparison.
This removes equality from a LTE or GTE comparison. */
static inline TCGCond tcg_high_cond(TCGCond c)
{
switch (c) {
case TCG_COND_GE:
case TCG_COND_LE:
case TCG_COND_GEU:
case TCG_COND_LEU:
return (TCGCond)(c ^ 8);
default:
return c;
}
}
typedef enum TCGTempVal {
TEMP_VAL_DEAD,
TEMP_VAL_REG,


@ -295,33 +295,15 @@ void plugin_register_inline_op(GArray **arr,
dyn_cb->inline_insn.imm = imm;
}
static inline uint32_t cb_to_tcg_flags(enum qemu_plugin_cb_flags flags)
{
uint32_t ret;
switch (flags) {
case QEMU_PLUGIN_CB_RW_REGS:
ret = 0;
break;
case QEMU_PLUGIN_CB_R_REGS:
ret = TCG_CALL_NO_WG;
break;
case QEMU_PLUGIN_CB_NO_REGS:
default:
ret = TCG_CALL_NO_RWG;
}
return ret;
}
inline void
plugin_register_dyn_cb__udata(GArray **arr,
qemu_plugin_vcpu_udata_cb_t cb,
enum qemu_plugin_cb_flags flags, void *udata)
void plugin_register_dyn_cb__udata(GArray **arr,
qemu_plugin_vcpu_udata_cb_t cb,
enum qemu_plugin_cb_flags flags,
void *udata)
{
struct qemu_plugin_dyn_cb *dyn_cb = plugin_get_dyn_cb(arr);
dyn_cb->userp = udata;
dyn_cb->tcg_flags = cb_to_tcg_flags(flags);
/* Note flags are discarded as unused. */
dyn_cb->f.vcpu_udata = cb;
dyn_cb->type = PLUGIN_CB_REGULAR;
}
@ -336,7 +318,7 @@ void plugin_register_vcpu_mem_cb(GArray **arr,
dyn_cb = plugin_get_dyn_cb(arr);
dyn_cb->userp = udata;
dyn_cb->tcg_flags = cb_to_tcg_flags(flags);
/* Note flags are discarded as unused. */
dyn_cb->type = PLUGIN_CB_REGULAR;
dyn_cb->rw = rw;
dyn_cb->f.generic = cb;


@ -1,12 +1,9 @@
#if TARGET_REGISTER_BITS == 64
# define dh_alias_tr i64
# define dh_is_64bit_tr 1
#else
# define dh_alias_tr i32
# define dh_is_64bit_tr 0
#endif
#define dh_ctype_tr target_ureg
#define dh_is_signed_tr 0
DEF_HELPER_2(excp, noreturn, env, int)
DEF_HELPER_FLAGS_2(tsv, TCG_CALL_NO_WG, void, env, tr)


@ -30,9 +30,6 @@
#define dh_ctype_Reg Reg *
#define dh_ctype_ZMMReg ZMMReg *
#define dh_ctype_MMXReg MMXReg *
#define dh_is_signed_Reg dh_is_signed_ptr
#define dh_is_signed_ZMMReg dh_is_signed_ptr
#define dh_is_signed_MMXReg dh_is_signed_ptr
DEF_HELPER_3(glue(psrlw, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(psraw, SUFFIX), void, env, Reg, Reg)


@ -17,7 +17,6 @@ DEF_HELPER_4(cas2l_parallel, void, env, i32, i32, i32)
#define dh_alias_fp ptr
#define dh_ctype_fp FPReg *
#define dh_is_signed_fp dh_is_signed_ptr
DEF_HELPER_3(exts32, void, env, fp, s32)
DEF_HELPER_3(extf32, void, env, fp, f32)


@ -109,11 +109,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
#define dh_alias_avr ptr
#define dh_ctype_avr ppc_avr_t *
#define dh_is_signed_avr dh_is_signed_ptr
#define dh_alias_vsr ptr
#define dh_ctype_vsr ppc_vsr_t *
#define dh_is_signed_vsr dh_is_signed_ptr
DEF_HELPER_3(vavgub, void, avr, avr, avr)
DEF_HELPER_3(vavguh, void, avr, avr, avr)
@ -697,7 +695,6 @@ DEF_HELPER_3(store_601_batu, void, env, i32, tl)
#define dh_alias_fprp ptr
#define dh_ctype_fprp ppc_fprp_t *
#define dh_is_signed_fprp dh_is_signed_ptr
DEF_HELPER_4(dadd, void, env, fprp, fprp, fprp)
DEF_HELPER_4(daddq, void, env, fprp, fprp, fprp)


@ -9,6 +9,12 @@ tcg_ss.add(files(
'tcg-op-gvec.c',
'tcg-op-vec.c',
))
tcg_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tci.c'))
if get_option('tcg_interpreter')
libffi = dependency('libffi', version: '>=3.0', required: true,
method: 'pkg-config', kwargs: static_kwargs)
specific_ss.add(libffi)
specific_ss.add(files('tci.c'))
endif
specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)


@ -25,6 +25,7 @@
#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
#include "tcg-internal.h"
#define CASE_OP_32_64(x) \
glue(glue(case INDEX_op_, x), _i32): \
@ -1481,7 +1482,7 @@ void tcg_optimize(TCGContext *s)
break;
case INDEX_op_call:
if (!(op->args[nb_oargs + nb_iargs + 1]
if (!(tcg_call_flags(op)
& (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
for (i = 0; i < nb_globals; i++) {
if (test_bit(i, temps_used.l)) {


@ -984,14 +984,18 @@ static void tcg_target_qemu_prologue(TCGContext *s)
{
int tmp_buf_size, frame_size;
/* The TCG temp buffer is at the top of the frame, immediately
below the frame pointer. */
/*
* The TCG temp buffer is at the top of the frame, immediately
* below the frame pointer. Use the logical (aligned) offset here;
* the stack bias is applied in temp_allocate_frame().
*/
tmp_buf_size = CPU_TEMP_BUF_NLONGS * (int)sizeof(long);
tcg_set_frame(s, TCG_REG_I6, TCG_TARGET_STACK_BIAS - tmp_buf_size,
tmp_buf_size);
tcg_set_frame(s, TCG_REG_I6, -tmp_buf_size, tmp_buf_size);
/* TCG_TARGET_CALL_STACK_OFFSET includes the stack bias, but is
otherwise the minimal frame usable by callees. */
/*
* TCG_TARGET_CALL_STACK_OFFSET includes the stack bias, but is
* otherwise the minimal frame usable by callees.
*/
frame_size = TCG_TARGET_CALL_STACK_OFFSET - TCG_TARGET_STACK_BIAS;
frame_size += TCG_STATIC_CALL_ARGS_SIZE + tmp_buf_size;
frame_size += TCG_TARGET_STACK_ALIGN - 1;


@ -27,6 +27,13 @@
#define TCG_HIGHWATER 1024
typedef struct TCGHelperInfo {
void *func;
const char *name;
unsigned flags;
unsigned typemask;
} TCGHelperInfo;
extern TCGContext tcg_init_ctx;
extern TCGContext **tcg_ctxs;
extern unsigned int tcg_cur_ctxs;
@ -37,4 +44,19 @@ bool tcg_region_alloc(TCGContext *s);
void tcg_region_initial_alloc(TCGContext *s);
void tcg_region_prologue_set(TCGContext *s);
static inline void *tcg_call_func(TCGOp *op)
{
return (void *)(uintptr_t)op->args[TCGOP_CALLO(op) + TCGOP_CALLI(op)];
}
static inline const TCGHelperInfo *tcg_call_info(TCGOp *op)
{
return (void *)(uintptr_t)op->args[TCGOP_CALLO(op) + TCGOP_CALLI(op) + 1];
}
static inline unsigned tcg_call_flags(TCGOp *op)
{
return tcg_call_info(op)->flags;
}
#endif /* TCG_INTERNAL_H */

tcg/tcg.c

@ -60,6 +60,10 @@
#include "exec/log.h"
#include "tcg-internal.h"
#ifdef CONFIG_TCG_INTERPRETER
#include <ffi.h>
#endif
/* Forward declarations for functions declared in tcg-target.c.inc and
used here. */
static void tcg_target_init(TCGContext *s);
@ -143,7 +147,12 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
intptr_t arg2);
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
TCGReg base, intptr_t ofs);
#ifdef CONFIG_TCG_INTERPRETER
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
ffi_cif *cif);
#else
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target);
#endif
static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
#ifdef TCG_TARGET_NEED_LDST_LABELS
static int tcg_out_ldst_finalize(TCGContext *s);
@ -532,13 +541,6 @@ void tcg_pool_reset(TCGContext *s)
s->pool_current = NULL;
}
typedef struct TCGHelperInfo {
void *func;
const char *name;
unsigned flags;
unsigned sizemask;
} TCGHelperInfo;
#include "exec/helper-proto.h"
static const TCGHelperInfo all_helpers[] = {
@ -546,6 +548,19 @@ static const TCGHelperInfo all_helpers[] = {
};
static GHashTable *helper_table;
#ifdef CONFIG_TCG_INTERPRETER
static GHashTable *ffi_table;
static ffi_type * const typecode_to_ffi[8] = {
[dh_typecode_void] = &ffi_type_void,
[dh_typecode_i32] = &ffi_type_uint32,
[dh_typecode_s32] = &ffi_type_sint32,
[dh_typecode_i64] = &ffi_type_uint64,
[dh_typecode_s64] = &ffi_type_sint64,
[dh_typecode_ptr] = &ffi_type_pointer,
};
#endif
static int indirect_reg_alloc_order[ARRAY_SIZE(tcg_target_reg_alloc_order)];
static void process_op_defs(TCGContext *s);
static TCGTemp *tcg_global_reg_new_internal(TCGContext *s, TCGType type,
@ -589,6 +604,47 @@ static void tcg_context_init(unsigned max_cpus)
(gpointer)&all_helpers[i]);
}
#ifdef CONFIG_TCG_INTERPRETER
/* g_direct_hash/equal for direct comparisons on uint32_t. */
ffi_table = g_hash_table_new(NULL, NULL);
for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
struct {
ffi_cif cif;
ffi_type *args[];
} *ca;
uint32_t typemask = all_helpers[i].typemask;
gpointer hash = (gpointer)(uintptr_t)typemask;
ffi_status status;
int nargs;
if (g_hash_table_lookup(ffi_table, hash)) {
continue;
}
/* Ignoring the return type, find the last non-zero field. */
nargs = 32 - clz32(typemask >> 3);
nargs = DIV_ROUND_UP(nargs, 3);
ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
ca->cif.rtype = typecode_to_ffi[typemask & 7];
ca->cif.nargs = nargs;
if (nargs != 0) {
ca->cif.arg_types = ca->args;
for (int j = 0; j < nargs; ++j) {
int typecode = extract32(typemask, (j + 1) * 3, 3);
ca->args[j] = typecode_to_ffi[typecode];
}
}
status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
ca->cif.rtype, ca->cif.arg_types);
assert(status == FFI_OK);
g_hash_table_insert(ffi_table, hash, (gpointer)&ca->cif);
}
#endif
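The nargs computation above is terse; this standalone arithmetic check
(editor's sketch, with a local clz32 mirroring the qemu/host-utils.h
behaviour) walks it through for the plugin_vcpu_udata_cb typemask
(void, i32, ptr) built in helper-head.h:

#include <assert.h>

static int clz32(unsigned v)
{
    return v ? __builtin_clz(v) : 32;
}

int main(void)
{
    unsigned typemask = 0x190;               /* void | i32<<3 | ptr<<6 */
    int nargs = 32 - clz32(typemask >> 3);   /* 6 significant bits */
    nargs = (nargs + 2) / 3;                 /* DIV_ROUND_UP(nargs, 3) */
    assert(nargs == 2);                      /* i32 + ptr */
    return 0;
}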
tcg_target_init(s);
process_op_defs(s);
@ -729,10 +785,16 @@ void tcg_prologue_init(TCGContext *s)
}
#endif
/* Assert that goto_ptr is implemented completely. */
#ifndef CONFIG_TCG_INTERPRETER
/*
* Assert that goto_ptr is implemented completely, setting an epilogue.
* For tci, we use NULL as the signal to return from the interpreter,
* so skip this check.
*/
if (TCG_TARGET_HAS_goto_ptr) {
tcg_debug_assert(tcg_code_gen_epilogue != NULL);
}
#endif
}
void tcg_func_start(TCGContext *s)
@ -1395,13 +1457,12 @@ bool tcg_op_supported(TCGOpcode op)
void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
{
int i, real_args, nb_rets, pi;
unsigned sizemask, flags;
TCGHelperInfo *info;
unsigned typemask;
const TCGHelperInfo *info;
TCGOp *op;
info = g_hash_table_lookup(helper_table, (gpointer)func);
flags = info->flags;
sizemask = info->sizemask;
typemask = info->typemask;
#ifdef CONFIG_PLUGIN
/* detect non-plugin helpers */
@ -1414,36 +1475,41 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
&& !defined(CONFIG_TCG_INTERPRETER)
/* We have 64-bit values in one register, but need to pass as two
separate parameters. Split them. */
int orig_sizemask = sizemask;
int orig_typemask = typemask;
int orig_nargs = nargs;
TCGv_i64 retl, reth;
TCGTemp *split_args[MAX_OPC_PARAM];
retl = NULL;
reth = NULL;
if (sizemask != 0) {
for (i = real_args = 0; i < nargs; ++i) {
int is_64bit = sizemask & (1 << (i+1)*2);
if (is_64bit) {
TCGv_i64 orig = temp_tcgv_i64(args[i]);
TCGv_i32 h = tcg_temp_new_i32();
TCGv_i32 l = tcg_temp_new_i32();
tcg_gen_extr_i64_i32(l, h, orig);
split_args[real_args++] = tcgv_i32_temp(h);
split_args[real_args++] = tcgv_i32_temp(l);
} else {
split_args[real_args++] = args[i];
}
typemask = 0;
for (i = real_args = 0; i < nargs; ++i) {
int argtype = extract32(orig_typemask, (i + 1) * 3, 3);
bool is_64bit = (argtype & ~1) == dh_typecode_i64;
if (is_64bit) {
TCGv_i64 orig = temp_tcgv_i64(args[i]);
TCGv_i32 h = tcg_temp_new_i32();
TCGv_i32 l = tcg_temp_new_i32();
tcg_gen_extr_i64_i32(l, h, orig);
split_args[real_args++] = tcgv_i32_temp(h);
typemask |= dh_typecode_i32 << (real_args * 3);
split_args[real_args++] = tcgv_i32_temp(l);
typemask |= dh_typecode_i32 << (real_args * 3);
} else {
split_args[real_args++] = args[i];
typemask |= argtype << (real_args * 3);
}
nargs = real_args;
args = split_args;
sizemask = 0;
}
nargs = real_args;
args = split_args;
#elif defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
for (i = 0; i < nargs; ++i) {
int is_64bit = sizemask & (1 << (i+1)*2);
int is_signed = sizemask & (2 << (i+1)*2);
if (!is_64bit) {
int argtype = extract32(typemask, (i + 1) * 3, 3);
bool is_32bit = (argtype & ~1) == dh_typecode_i32;
bool is_signed = argtype & 1;
if (is_32bit) {
TCGv_i64 temp = tcg_temp_new_i64();
TCGv_i64 orig = temp_tcgv_i64(args[i]);
if (is_signed) {
@ -1462,7 +1528,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
if (ret != NULL) {
#if defined(__sparc__) && !defined(__arch64__) \
&& !defined(CONFIG_TCG_INTERPRETER)
if (orig_sizemask & 1) {
if ((typemask & 6) == dh_typecode_i64) {
/* The 32-bit ABI is going to return the 64-bit value in
the %o0/%o1 register pair. Prepare for this by using
two return temporaries, and reassemble below. */
@ -1476,7 +1542,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
nb_rets = 1;
}
#else
if (TCG_TARGET_REG_BITS < 64 && (sizemask & 1)) {
if (TCG_TARGET_REG_BITS < 64 && (typemask & 6) == dh_typecode_i64) {
#ifdef HOST_WORDS_BIGENDIAN
op->args[pi++] = temp_arg(ret + 1);
op->args[pi++] = temp_arg(ret);
@ -1497,25 +1563,39 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
real_args = 0;
for (i = 0; i < nargs; i++) {
int is_64bit = sizemask & (1 << (i+1)*2);
if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
#ifdef TCG_TARGET_CALL_ALIGN_ARGS
/* some targets want aligned 64 bit args */
if (real_args & 1) {
op->args[pi++] = TCG_CALL_DUMMY_ARG;
real_args++;
}
int argtype = extract32(typemask, (i + 1) * 3, 3);
bool is_64bit = (argtype & ~1) == dh_typecode_i64;
bool want_align = false;
#if defined(CONFIG_TCG_INTERPRETER)
/*
* Align all arguments, so that they land in predictable places
* for passing off to ffi_call.
*/
want_align = true;
#elif defined(TCG_TARGET_CALL_ALIGN_ARGS)
/* Some targets want aligned 64 bit args */
want_align = is_64bit;
#endif
/* If stack grows up, then we will be placing successive
arguments at lower addresses, which means we need to
reverse the order compared to how we would normally
treat either big or little-endian. For those arguments
that will wind up in registers, this still works for
HPPA (the only current STACK_GROWSUP target) since the
argument registers are *also* allocated in decreasing
order. If another such target is added, this logic may
have to get more complicated to differentiate between
stack arguments and register arguments. */
if (TCG_TARGET_REG_BITS < 64 && want_align && (real_args & 1)) {
op->args[pi++] = TCG_CALL_DUMMY_ARG;
real_args++;
}
if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
/*
* If stack grows up, then we will be placing successive
* arguments at lower addresses, which means we need to
* reverse the order compared to how we would normally
* treat either big or little-endian. For those arguments
* that will wind up in registers, this still works for
* HPPA (the only current STACK_GROWSUP target) since the
* argument registers are *also* allocated in decreasing
* order. If another such target is added, this logic may
* have to get more complicated to differentiate between
* stack arguments and register arguments.
*/
#if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
op->args[pi++] = temp_arg(args[i] + 1);
op->args[pi++] = temp_arg(args[i]);
@ -1531,7 +1611,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
real_args++;
}
op->args[pi++] = (uintptr_t)func;
op->args[pi++] = flags;
op->args[pi++] = (uintptr_t)info;
TCGOP_CALLI(op) = real_args;
/* Make sure the fields didn't overflow. */
@ -1542,7 +1622,9 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
&& !defined(CONFIG_TCG_INTERPRETER)
/* Free all of the parts we allocated above. */
for (i = real_args = 0; i < orig_nargs; ++i) {
int is_64bit = orig_sizemask & (1 << (i+1)*2);
int argtype = extract32(orig_typemask, (i + 1) * 3, 3);
bool is_64bit = (argtype & ~1) == dh_typecode_i64;
if (is_64bit) {
tcg_temp_free_internal(args[real_args++]);
tcg_temp_free_internal(args[real_args++]);
@ -1550,7 +1632,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
real_args++;
}
}
if (orig_sizemask & 1) {
if ((orig_typemask & 6) == dh_typecode_i64) {
/* The 32-bit ABI returned two 32-bit pieces. Re-assemble them.
Note that describing these as TCGv_i64 eliminates an unnecessary
zero-extension that tcg_gen_concat_i32_i64 would create. */
@ -1560,8 +1642,10 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
}
#elif defined(TCG_TARGET_EXTEND_ARGS) && TCG_TARGET_REG_BITS == 64
for (i = 0; i < nargs; ++i) {
int is_64bit = sizemask & (1 << (i+1)*2);
if (!is_64bit) {
int argtype = extract32(typemask, (i + 1) * 3, 3);
bool is_32bit = (argtype & ~1) == dh_typecode_i32;
if (is_32bit) {
tcg_temp_free_internal(args[i]);
}
}
@ -1646,19 +1730,6 @@ static char *tcg_get_arg_str(TCGContext *s, char *buf,
return tcg_get_arg_str_ptr(s, buf, buf_size, arg_temp(arg));
}
/* Find helper name. */
static inline const char *tcg_find_helper(TCGContext *s, uintptr_t val)
{
const char *ret = NULL;
if (helper_table) {
TCGHelperInfo *info = g_hash_table_lookup(helper_table, (gpointer)val);
if (info) {
ret = info->name;
}
}
return ret;
}
static const char * const cond_name[] =
{
[TCG_COND_NEVER] = "never",
@ -1749,15 +1820,28 @@ static void tcg_dump_ops(TCGContext *s, bool have_prefs)
col += qemu_log(" " TARGET_FMT_lx, a);
}
} else if (c == INDEX_op_call) {
const TCGHelperInfo *info = tcg_call_info(op);
void *func = tcg_call_func(op);
/* variable number of arguments */
nb_oargs = TCGOP_CALLO(op);
nb_iargs = TCGOP_CALLI(op);
nb_cargs = def->nb_cargs;
/* function name, flags, out args */
col += qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name,
tcg_find_helper(s, op->args[nb_oargs + nb_iargs]),
op->args[nb_oargs + nb_iargs + 1], nb_oargs);
col += qemu_log(" %s ", def->name);
/*
* Print the function name from TCGHelperInfo, if available.
* Note that plugins have a template function for the info,
* but the actual function pointer comes from the plugin.
*/
if (func == info->func) {
col += qemu_log("%s", info->name);
} else {
col += qemu_log("plugin(%p)", func);
}
col += qemu_log("$0x%x,$%d", info->flags, nb_oargs);
for (i = 0; i < nb_oargs; i++) {
col += qemu_log(",%s", tcg_get_arg_str(s, buf, sizeof(buf),
op->args[i]));
@ -2144,7 +2228,6 @@ static void reachable_code_pass(TCGContext *s)
QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
bool remove = dead;
TCGLabel *label;
int call_flags;
switch (op->opc) {
case INDEX_op_set_label:
@ -2189,8 +2272,7 @@ static void reachable_code_pass(TCGContext *s)
case INDEX_op_call:
/* Notice noreturn helper calls, raising exceptions. */
call_flags = op->args[TCGOP_CALLO(op) + TCGOP_CALLI(op) + 1];
if (call_flags & TCG_CALL_NO_RETURN) {
if (tcg_call_flags(op) & TCG_CALL_NO_RETURN) {
dead = true;
}
break;
@ -2391,7 +2473,7 @@ static void liveness_pass_1(TCGContext *s)
nb_oargs = TCGOP_CALLO(op);
nb_iargs = TCGOP_CALLI(op);
call_flags = op->args[nb_oargs + nb_iargs + 1];
call_flags = tcg_call_flags(op);
/* pure functions can be removed if their result is unused */
if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
@ -2706,7 +2788,7 @@ static bool liveness_pass_2(TCGContext *s)
if (opc == INDEX_op_call) {
nb_oargs = TCGOP_CALLO(op);
nb_iargs = TCGOP_CALLI(op);
call_flags = op->args[nb_oargs + nb_iargs + 1];
call_flags = tcg_call_flags(op);
} else {
nb_iargs = def->nb_iargs;
nb_oargs = def->nb_oargs;
@ -2933,20 +3015,42 @@ static void check_regs(TCGContext *s)
static void temp_allocate_frame(TCGContext *s, TCGTemp *ts)
{
#if !(defined(__sparc__) && TCG_TARGET_REG_BITS == 64)
/* Sparc64 stack is accessed with offset of 2047 */
s->current_frame_offset = (s->current_frame_offset +
(tcg_target_long)sizeof(tcg_target_long) - 1) &
~(sizeof(tcg_target_long) - 1);
#endif
if (s->current_frame_offset + (tcg_target_long)sizeof(tcg_target_long) >
s->frame_end) {
tcg_abort();
intptr_t off, size, align;
switch (ts->type) {
case TCG_TYPE_I32:
size = align = 4;
break;
case TCG_TYPE_I64:
case TCG_TYPE_V64:
size = align = 8;
break;
case TCG_TYPE_V128:
size = align = 16;
break;
case TCG_TYPE_V256:
/* Note that we do not require aligned storage for V256. */
size = 32, align = 16;
break;
default:
g_assert_not_reached();
}
ts->mem_offset = s->current_frame_offset;
assert(align <= TCG_TARGET_STACK_ALIGN);
off = ROUND_UP(s->current_frame_offset, align);
/* If we've exhausted the stack frame, restart with a smaller TB. */
if (off + size > s->frame_end) {
tcg_raise_tb_overflow(s);
}
s->current_frame_offset = off + size;
ts->mem_offset = off;
#if defined(__sparc__)
ts->mem_offset += TCG_TARGET_STACK_BIAS;
#endif
ts->mem_base = s->frame_temp;
ts->mem_allocated = 1;
s->current_frame_offset += sizeof(tcg_target_long);
}
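The alignment step is easy to misread; this standalone sketch (editor's
example, with a power-of-two ROUND_UP standing in for QEMU's macro)
shows one allocation of an 8-byte, 8-aligned I64 slot when the running
offset sits at 4:

#include <assert.h>
#include <stdint.h>

/* Power-of-two round-up, standing in for QEMU's ROUND_UP(). */
#define ROUND_UP(n, d) (((n) + (d) - 1) & -(intptr_t)(d))

int main(void)
{
    intptr_t current_frame_offset = 4;   /* e.g. after one I32 temp */
    intptr_t size = 8, align = 8;        /* TCG_TYPE_I64 / TCG_TYPE_V64 */

    intptr_t off = ROUND_UP(current_frame_offset, align);
    assert(off == 8);                    /* 4-byte alignment hole */
    current_frame_offset = off + size;
    assert(current_frame_offset == 16);  /* next free byte */
    return 0;
}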
static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
@ -3777,6 +3881,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
const int nb_oargs = TCGOP_CALLO(op);
const int nb_iargs = TCGOP_CALLI(op);
const TCGLifeData arg_life = op->life;
const TCGHelperInfo *info;
int flags, nb_regs, i;
TCGReg reg;
TCGArg arg;
@ -3787,8 +3892,9 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
int allocate_args;
TCGRegSet allocated_regs;
func_addr = (tcg_insn_unit *)(intptr_t)op->args[nb_oargs + nb_iargs];
flags = op->args[nb_oargs + nb_iargs + 1];
func_addr = tcg_call_func(op);
info = tcg_call_info(op);
flags = info->flags;
nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
if (nb_regs > nb_iargs) {
@ -3880,7 +3986,16 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
save_globals(s, allocated_regs);
}
#ifdef CONFIG_TCG_INTERPRETER
{
gpointer hash = (gpointer)(uintptr_t)info->typemask;
ffi_cif *cif = g_hash_table_lookup(ffi_table, hash);
assert(cif != NULL);
tcg_out_call(s, func_addr, cif);
}
#else
tcg_out_call(s, func_addr);
#endif
/* assign output registers and emit moves if needed */
for(i = 0; i < nb_oargs; i++) {

tcg/tci.c
(File diff suppressed because it is too large.)

@ -23,10 +23,12 @@ This is what TCI (Tiny Code Interpreter) does.
Like each TCG host frontend, TCI implements the code generator in
tcg-target.c.inc, tcg-target.h. Both files are in directory tcg/tci.
The additional file tcg/tci.c adds the interpreter.
The additional file tcg/tci.c adds the interpreter and disassembler.
The bytecode consists of opcodes (same numeric values as those used by
TCG), command length and arguments of variable size and number.
The bytecode consists of opcodes (with only a few exceptions, with
the same numeric values and semantics as used by TCG), and up
to six arguments packed into a 32-bit integer. See comments in tci.c
for details on the encoding.
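For illustration, here is how one such 32-bit unit could be unpacked
(editor's sketch, not the actual interpreter loop; the field layout
matches the tcg_out_op_rrr() emitter in tcg/tci/tcg-target.c.inc —
opcode in bits [0,8), 4-bit register fields at bits 8, 12 and 16 — and
the local extract32 mirrors qemu/bitops.h):

#include <stdint.h>
#include <stdio.h>

static uint32_t extract32(uint32_t value, int start, int length)
{
    return (value >> start) & (~0u >> (32 - length));
}

int main(void)
{
    uint32_t insn = 0;
    insn |= 0x34u << 0;    /* opcode */
    insn |= 1u << 8;       /* r0 */
    insn |= 2u << 12;      /* r1 */
    insn |= 3u << 16;      /* r2 */

    printf("op=%u r0=%u r1=%u r2=%u\n",
           extract32(insn, 0, 8), extract32(insn, 8, 4),
           extract32(insn, 12, 4), extract32(insn, 16, 4));
    return 0;
}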
3) Usage
@ -39,11 +41,6 @@ suggest using this option. Setting it automatically would need
additional code in configure which must be fixed when new native TCG
implementations are added.
System emulation should work on any 32 or 64 bit host.
User mode emulation might work. Maybe a new linker script (*.ld)
is needed. Byte order might be wrong (on big endian hosts)
and need fixes in configure.
For hosts with native TCG, the interpreter TCI can be enabled by
configure --enable-tcg-interpreter
@ -118,13 +115,6 @@ u1 = linux-user-test works
in the interpreter. These opcodes raise a runtime exception, so it is
possible to see where code must be added.
* The pseudo code is not optimized and still ugly. For hosts with special
alignment requirements, it needs some fixes (maybe aligned bytecode
would also improve speed for hosts which support byte alignment).
* A better disassembler for the pseudo code would be nice (a very primitive
disassembler is included in tcg-target.c.inc).
* It might be useful to have a runtime option which selects the native TCG
or TCI, so QEMU would have to include two TCGs. Today, selecting TCI
is a configure option, so you need two compilations of QEMU.


@ -9,6 +9,7 @@
* Each operand should be a sequence of constraint letters as defined by
* tcg-target-con-str.h; the constraint combination is inclusive or.
*/
C_O0_I1(r)
C_O0_I2(r, r)
C_O0_I3(r, r, r)
C_O0_I4(r, r, r, r)


@ -22,24 +22,14 @@
* THE SOFTWARE.
*/
/* TODO list:
* - See TODO comments in code.
*/
/* Marker for missing code. */
#define TODO() \
do { \
fprintf(stderr, "TODO %s:%u: %s()\n", \
__FILE__, __LINE__, __func__); \
tcg_abort(); \
} while (0)
/* Bitfield n...m (in 32 bit value). */
#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m)
#include "../tcg-pool.c.inc"
static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
{
switch (op) {
case INDEX_op_goto_ptr:
return C_O0_I1(r);
case INDEX_op_ld8u_i32:
case INDEX_op_ld8s_i32:
case INDEX_op_ld16u_i32:
@ -73,6 +63,12 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_bswap32_i32:
case INDEX_op_bswap32_i64:
case INDEX_op_bswap64_i64:
case INDEX_op_extract_i32:
case INDEX_op_extract_i64:
case INDEX_op_sextract_i32:
case INDEX_op_sextract_i64:
case INDEX_op_ctpop_i32:
case INDEX_op_ctpop_i64:
return C_O1_I1(r, r);
case INDEX_op_st8_i32:
@ -128,24 +124,37 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_setcond_i64:
case INDEX_op_deposit_i32:
case INDEX_op_deposit_i64:
case INDEX_op_clz_i32:
case INDEX_op_clz_i64:
case INDEX_op_ctz_i32:
case INDEX_op_ctz_i64:
return C_O1_I2(r, r, r);
case INDEX_op_brcond_i32:
case INDEX_op_brcond_i64:
return C_O0_I2(r, r);
#if TCG_TARGET_REG_BITS == 32
/* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
case INDEX_op_add2_i32:
case INDEX_op_add2_i64:
case INDEX_op_sub2_i32:
case INDEX_op_sub2_i64:
return C_O2_I4(r, r, r, r, r, r);
#if TCG_TARGET_REG_BITS == 32
case INDEX_op_brcond2_i32:
return C_O0_I4(r, r, r, r);
#endif
case INDEX_op_mulu2_i32:
case INDEX_op_mulu2_i64:
case INDEX_op_muls2_i32:
case INDEX_op_muls2_i64:
return C_O2_I2(r, r, r, r);
case INDEX_op_movcond_i32:
case INDEX_op_movcond_i64:
case INDEX_op_setcond2_i32:
return C_O1_I4(r, r, r, r, r);
#endif
case INDEX_op_qemu_ld_i32:
return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
@ -170,8 +179,6 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
}
static const int tcg_target_reg_alloc_order[] = {
TCG_REG_R0,
TCG_REG_R1,
TCG_REG_R2,
TCG_REG_R3,
TCG_REG_R4,
@ -186,29 +193,16 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_R13,
TCG_REG_R14,
TCG_REG_R15,
TCG_REG_R1,
TCG_REG_R0,
};
#if MAX_OPC_PARAM_IARGS != 6
# error Fix needed, number of supported input arguments changed!
#endif
static const int tcg_target_call_iarg_regs[] = {
TCG_REG_R0,
TCG_REG_R1,
TCG_REG_R2,
TCG_REG_R3,
TCG_REG_R4,
TCG_REG_R5,
#if TCG_TARGET_REG_BITS == 32
/* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
TCG_REG_R6,
TCG_REG_R7,
TCG_REG_R8,
TCG_REG_R9,
TCG_REG_R10,
TCG_REG_R11,
#endif
};
/* No call arguments via registers. All will be stored on the "stack". */
static const int tcg_target_call_iarg_regs[] = { };
static const int tcg_target_call_oarg_regs[] = {
TCG_REG_R0,
@ -241,317 +235,281 @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
intptr_t value, intptr_t addend)
{
/* tcg_out_reloc always uses the same type, addend. */
tcg_debug_assert(type == sizeof(tcg_target_long));
intptr_t diff = value - (intptr_t)(code_ptr + 1);
tcg_debug_assert(addend == 0);
tcg_debug_assert(value != 0);
if (TCG_TARGET_REG_BITS == 32) {
tcg_patch32(code_ptr, value);
} else {
tcg_patch64(code_ptr, value);
}
return true;
}
/* Write value (native size). */
static void tcg_out_i(TCGContext *s, tcg_target_ulong v)
{
if (TCG_TARGET_REG_BITS == 32) {
tcg_out32(s, v);
} else {
tcg_out64(s, v);
}
}
/* Write opcode. */
static void tcg_out_op_t(TCGContext *s, TCGOpcode op)
{
tcg_out8(s, op);
tcg_out8(s, 0);
}
/* Write register. */
static void tcg_out_r(TCGContext *s, TCGArg t0)
{
tcg_debug_assert(t0 < TCG_TARGET_NB_REGS);
tcg_out8(s, t0);
}
/* Write label. */
static void tci_out_label(TCGContext *s, TCGLabel *label)
{
if (label->has_value) {
tcg_out_i(s, label->u.value);
tcg_debug_assert(label->u.value);
} else {
tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), label, 0);
s->code_ptr += sizeof(tcg_target_ulong);
tcg_debug_assert(type == 20);
if (diff == sextract32(diff, 0, type)) {
tcg_patch32(code_ptr, deposit32(*code_ptr, 32 - type, type, diff));
return true;
}
return false;
}
static void stack_bounds_check(TCGReg base, target_long offset)
{
if (base == TCG_REG_CALL_STACK) {
tcg_debug_assert(offset < 0);
tcg_debug_assert(offset >= -(CPU_TEMP_BUF_NLONGS * sizeof(long)));
tcg_debug_assert(offset >= 0);
tcg_debug_assert(offset < (TCG_STATIC_CALL_ARGS_SIZE +
TCG_STATIC_FRAME_SIZE));
}
}
static void tcg_out_op_l(TCGContext *s, TCGOpcode op, TCGLabel *l0)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tci_out_label(s, l0);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
tcg_out_reloc(s, s->code_ptr, 20, l0, 0);
insn = deposit32(insn, 0, 8, op);
tcg_out32(s, insn);
}
static void tcg_out_op_p(TCGContext *s, TCGOpcode op, void *p0)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
intptr_t diff;
tcg_out_op_t(s, op);
tcg_out_i(s, (uintptr_t)p0);
/* Special case for exit_tb: map null -> 0. */
if (p0 == NULL) {
diff = 0;
} else {
diff = p0 - (void *)(s->code_ptr + 1);
tcg_debug_assert(diff != 0);
if (diff != sextract32(diff, 0, 20)) {
tcg_raise_tb_overflow(s);
}
}
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 12, 20, diff);
tcg_out32(s, insn);
}
old_code_ptr[1] = s->code_ptr - old_code_ptr;
static void tcg_out_op_r(TCGContext *s, TCGOpcode op, TCGReg r0)
{
tcg_insn_unit insn = 0;
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
tcg_out32(s, insn);
}
static void tcg_out_op_v(TCGContext *s, TCGOpcode op)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_out_op_t(s, op);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
tcg_out32(s, (uint8_t)op);
}
static void tcg_out_op_ri(TCGContext *s, TCGOpcode op, TCGReg r0, int32_t i1)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tcg_out_r(s, r0);
tcg_out32(s, i1);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
tcg_debug_assert(i1 == sextract32(i1, 0, 20));
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
insn = deposit32(insn, 12, 20, i1);
tcg_out32(s, insn);
}
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_op_rI(TCGContext *s, TCGOpcode op,
TCGReg r0, uint64_t i1)
static void tcg_out_op_rl(TCGContext *s, TCGOpcode op, TCGReg r0, TCGLabel *l1)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tcg_out_r(s, r0);
tcg_out64(s, i1);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
tcg_out_reloc(s, s->code_ptr, 20, l1, 0);
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
tcg_out32(s, insn);
}
#endif
static void tcg_out_op_rr(TCGContext *s, TCGOpcode op, TCGReg r0, TCGReg r1)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tcg_out_r(s, r0);
tcg_out_r(s, r1);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
insn = deposit32(insn, 12, 4, r1);
tcg_out32(s, insn);
}
static void tcg_out_op_rrm(TCGContext *s, TCGOpcode op,
TCGReg r0, TCGReg r1, TCGArg m2)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tcg_out_r(s, r0);
tcg_out_r(s, r1);
tcg_out32(s, m2);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
tcg_debug_assert(m2 == extract32(m2, 0, 12));
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
insn = deposit32(insn, 12, 4, r1);
insn = deposit32(insn, 20, 12, m2);
tcg_out32(s, insn);
}
static void tcg_out_op_rrr(TCGContext *s, TCGOpcode op,
TCGReg r0, TCGReg r1, TCGReg r2)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_insn_unit insn = 0;
tcg_out_op_t(s, op);
tcg_out_r(s, r0);
tcg_out_r(s, r1);
tcg_out_r(s, r2);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
insn = deposit32(insn, 0, 8, op);
insn = deposit32(insn, 8, 4, r0);
insn = deposit32(insn, 12, 4, r1);
insn = deposit32(insn, 16, 4, r2);
tcg_out32(s, insn);
}
static void tcg_out_op_rrs(TCGContext *s, TCGOpcode op,
                           TCGReg r0, TCGReg r1, intptr_t i2)
{
    tcg_insn_unit insn = 0;

    tcg_debug_assert(i2 == sextract32(i2, 0, 16));
    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 16, i2);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrbb(TCGContext *s, TCGOpcode op, TCGReg r0,
                            TCGReg r1, uint8_t b2, uint8_t b3)
{
    tcg_insn_unit insn = 0;

    tcg_debug_assert(b2 == extract32(b2, 0, 6));
    tcg_debug_assert(b3 == extract32(b3, 0, 6));
    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 6, b2);
    insn = deposit32(insn, 22, 6, b3);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrc(TCGContext *s, TCGOpcode op,
                            TCGReg r0, TCGReg r1, TCGReg r2, TCGCond c3)
{
    tcg_insn_unit insn = 0;

    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 4, c3);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrm(TCGContext *s, TCGOpcode op,
                            TCGReg r0, TCGReg r1, TCGReg r2, TCGArg m3)
{
    tcg_insn_unit insn = 0;

    tcg_debug_assert(m3 == extract32(m3, 0, 12));
    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 12, m3);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrbb(TCGContext *s, TCGOpcode op, TCGReg r0,
                             TCGReg r1, TCGReg r2, uint8_t b3, uint8_t b4)
{
    tcg_insn_unit insn = 0;

    tcg_debug_assert(b3 == extract32(b3, 0, 6));
    tcg_debug_assert(b4 == extract32(b4, 0, 6));
    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 6, b3);
    insn = deposit32(insn, 26, 6, b4);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrrr(TCGContext *s, TCGOpcode op, TCGReg r0,
                             TCGReg r1, TCGReg r2, TCGReg r3, TCGReg r4)
{
    tcg_insn_unit insn = 0;

    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 4, r3);
    insn = deposit32(insn, 24, 4, r4);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrr(TCGContext *s, TCGOpcode op,
                            TCGReg r0, TCGReg r1, TCGReg r2, TCGReg r3)
{
    tcg_insn_unit insn = 0;

    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 4, r3);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrrrc(TCGContext *s, TCGOpcode op,
                              TCGReg r0, TCGReg r1, TCGReg r2,
                              TCGReg r3, TCGReg r4, TCGCond c5)
{
    tcg_insn_unit insn = 0;

    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 4, r3);
    insn = deposit32(insn, 24, 4, r4);
    insn = deposit32(insn, 28, 4, c5);
    tcg_out32(s, insn);
}
static void tcg_out_op_rrrrrr(TCGContext *s, TCGOpcode op,
                              TCGReg r0, TCGReg r1, TCGReg r2,
                              TCGReg r3, TCGReg r4, TCGReg r5)
{
    tcg_insn_unit insn = 0;

    insn = deposit32(insn, 0, 8, op);
    insn = deposit32(insn, 8, 4, r0);
    insn = deposit32(insn, 12, 4, r1);
    insn = deposit32(insn, 16, 4, r2);
    insn = deposit32(insn, 20, 4, r3);
    insn = deposit32(insn, 24, 4, r4);
    insn = deposit32(insn, 28, 4, r5);
    tcg_out32(s, insn);
}
static void tcg_out_ldst(TCGContext *s, TCGOpcode op, TCGReg val,
                         TCGReg base, intptr_t offset)
{
    stack_bounds_check(base, offset);
    if (offset != sextract32(offset, 0, 16)) {
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, offset);
        tcg_out_op_rrr(s, (TCG_TARGET_REG_BITS == 32
                           ? INDEX_op_add_i32 : INDEX_op_add_i64),
                       TCG_REG_TMP, TCG_REG_TMP, base);
        base = TCG_REG_TMP;
        offset = 0;
    }
    tcg_out_op_rrs(s, op, val, base, offset);
}
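tcg_out_ldst() keeps the common case to one instruction: an offset that fits the signed 16-bit field is encoded directly, while a wider offset is first folded into TCG_REG_TMP so the final access uses offset 0. A standalone sketch of just that decision (fits_s16 is an invented helper name):

#include <stdint.h>
#include <stdio.h>

static int fits_s16(intptr_t x)
{
    return x == (int16_t)x;
}

int main(void)
{
    intptr_t offsets[] = { 0x7fff, -0x8000, 0x8000 };

    for (int i = 0; i < 3; i++) {
        printf("offset %6ld: %s\n", (long)offsets[i],
               fits_s16(offsets[i])
               ? "emit ld/st directly"
               : "movi TMP, offset; add TMP, TMP, base; ld/st offset 0");
    }
    return 0;
}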
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg val, TCGReg base,
                       intptr_t offset)
{
    switch (type) {
    case TCG_TYPE_I32:
        tcg_out_ldst(s, INDEX_op_ld_i32, val, base, offset);
        break;
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
        tcg_out_ldst(s, INDEX_op_ld_i64, val, base, offset);
        break;
#endif
    default:
@@ -581,24 +539,46 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
        arg = (int32_t)arg;
        /* fall through */
    case TCG_TYPE_I64:
#endif
        break;
    default:
        g_assert_not_reached();
    }

    if (arg == sextract32(arg, 0, 20)) {
        tcg_out_op_ri(s, INDEX_op_tci_movi, ret, arg);
    } else {
        tcg_insn_unit insn = 0;

        new_pool_label(s, arg, 20, s->code_ptr, 0);
        insn = deposit32(insn, 0, 8, INDEX_op_tci_movl);
        insn = deposit32(insn, 8, 4, ret);
        tcg_out32(s, insn);
    }
}
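So the constant loader has two shapes: a value that fits the signed 20-bit immediate is encoded inline as tci_movi, and anything wider is stored in the constant pool and loaded via tci_movl through a 20-bit reloc. A sketch of the size decision alone (fits_s20 is an invented name):

#include <stdint.h>
#include <stdio.h>

static int fits_s20(int64_t x)
{
    return x >= -(1 << 19) && x < (1 << 19);
}

int main(void)
{
    int64_t samples[] = { 42, -1, 1 << 19, INT64_MIN };

    for (int i = 0; i < 4; i++) {
        printf("%20lld -> %s\n", (long long)samples[i],
               fits_s20(samples[i]) ? "tci_movi (inline 20-bit immediate)"
                                    : "tci_movl (constant pool + reloc)");
    }
    return 0;
}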
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
                         ffi_cif *cif)
{
    tcg_insn_unit insn = 0;
    uint8_t which;

    if (cif->rtype == &ffi_type_void) {
        which = 0;
    } else if (cif->rtype->size == 4) {
        which = 1;
    } else {
        tcg_debug_assert(cif->rtype->size == 8);
        which = 2;
    }
    new_pool_l2(s, 20, s->code_ptr, 0, (uintptr_t)func, (uintptr_t)cif);
    insn = deposit32(insn, 0, 8, INDEX_op_call);
    insn = deposit32(insn, 8, 4, which);
    tcg_out32(s, insn);
}
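Calls are now routed through libffi: the function pointer and its ffi_cif descriptor live as a pair in the constant pool, and the 4-bit `which` field only tells the interpreter how much return value to keep (none, 32 or 64 bits). A self-contained illustration of the libffi mechanism itself, not of QEMU's plumbing; compile with -lffi:

#include <ffi.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t add64(uint64_t a, uint64_t b)
{
    return a + b;
}

int main(void)
{
    ffi_cif cif;
    ffi_type *argtypes[2] = { &ffi_type_uint64, &ffi_type_uint64 };
    uint64_t a = 40, b = 2, ret = 0;
    void *argvalues[2] = { &a, &b };

    /* Describe the call signature once... */
    if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, 2,
                     &ffi_type_uint64, argtypes) != FFI_OK) {
        return 1;
    }
    /* ...then any helper with that signature can be invoked generically. */
    ffi_call(&cif, FFI_FN(add64), &ret, argvalues);
    printf("%llu\n", (unsigned long long)ret);    /* prints 42 */
    return 0;
}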
#if TCG_TARGET_REG_BITS == 64
@@ -629,6 +609,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
        set_jmp_reset_offset(s, args[0]);
        break;

    case INDEX_op_goto_ptr:
        tcg_out_op_r(s, opc, args[0]);
        break;

    case INDEX_op_br:
        tcg_out_op_l(s, opc, arg_label(args[0]));
        break;
@@ -637,12 +621,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
        tcg_out_op_rrrc(s, opc, args[0], args[1], args[2], args[3]);
        break;

    CASE_32_64(movcond)
    case INDEX_op_setcond2_i32:
        tcg_out_op_rrrrrc(s, opc, args[0], args[1], args[2],
                          args[3], args[4], args[5]);
        break;
    CASE_32_64(ld8u)
    CASE_32_64(ld8s)
@@ -657,8 +640,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
    case INDEX_op_st_i32:
    CASE_64(st32)
    CASE_64(st)
        tcg_out_ldst(s, opc, args[0], args[1], args[2]);
        break;
    CASE_32_64(add)
@@ -681,6 +663,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
    CASE_32_64(divu)    /* Optional (TCG_TARGET_HAS_div_*). */
    CASE_32_64(rem)     /* Optional (TCG_TARGET_HAS_div_*). */
    CASE_32_64(remu)    /* Optional (TCG_TARGET_HAS_div_*). */
    CASE_32_64(clz)     /* Optional (TCG_TARGET_HAS_clz_*). */
    CASE_32_64(ctz)     /* Optional (TCG_TARGET_HAS_ctz_*). */
        tcg_out_op_rrr(s, opc, args[0], args[1], args[2]);
        break;
@@ -696,8 +680,24 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
        }
        break;
    CASE_32_64(extract)  /* Optional (TCG_TARGET_HAS_extract_*). */
    CASE_32_64(sextract) /* Optional (TCG_TARGET_HAS_sextract_*). */
        {
            TCGArg pos = args[2], len = args[3];
            TCGArg max = tcg_op_defs[opc].flags & TCG_OPF_64BIT ? 64 : 32;

            tcg_debug_assert(pos < max);
            tcg_debug_assert(pos + len <= max);

            tcg_out_op_rrbb(s, opc, args[0], args[1], pos, len);
        }
        break;
    CASE_32_64(brcond)
        tcg_out_op_rrrc(s, (opc == INDEX_op_brcond_i32
                            ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64),
                        TCG_REG_TMP, args[0], args[1], args[2]);
        tcg_out_op_rl(s, opc, TCG_REG_TMP, arg_label(args[3]));
        break;
    CASE_32_64(neg)      /* Optional (TCG_TARGET_HAS_neg_*). */
@@ -713,24 +713,29 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
    CASE_32_64(bswap16)  /* Optional (TCG_TARGET_HAS_bswap16_*). */
    CASE_32_64(bswap32)  /* Optional (TCG_TARGET_HAS_bswap32_*). */
    CASE_64(bswap64)     /* Optional (TCG_TARGET_HAS_bswap64_i64). */
    CASE_32_64(ctpop)    /* Optional (TCG_TARGET_HAS_ctpop_*). */
        tcg_out_op_rr(s, opc, args[0], args[1]);
        break;
    CASE_32_64(add2)
    CASE_32_64(sub2)
        tcg_out_op_rrrrrr(s, opc, args[0], args[1], args[2],
                          args[3], args[4], args[5]);
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_op_rrrrrc(s, INDEX_op_setcond2_i32, TCG_REG_TMP,
                          args[0], args[1], args[2], args[3], args[4]);
        tcg_out_op_rl(s, INDEX_op_brcond_i32, TCG_REG_TMP, arg_label(args[5]));
        break;
#endif

    CASE_32_64(mulu2)
    CASE_32_64(muls2)
        tcg_out_op_rrrr(s, opc, args[0], args[1], args[2], args[3]);
        break;
    case INDEX_op_qemu_ld_i32:
    case INDEX_op_qemu_st_i32:
        if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
@@ -747,8 +752,9 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
        } else if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
            tcg_out_op_rrrm(s, opc, args[0], args[1], args[2], args[3]);
        } else {
            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_TMP, args[4]);
            tcg_out_op_rrrrr(s, opc, args[0], args[1],
                             args[2], args[3], TCG_REG_TMP);
        }
        break;
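Two cases above lean on the reserved TCG_REG_TMP instead of a wider encoding: brcond becomes a setcond that materializes the comparison into TMP plus a branch on TMP, and the five-register qemu_ld/st form moves the memop index into TMP because the instruction word has no field left for it. A standalone sketch of both points:

#include <assert.h>

/* Demo only: stands in for "setcond TMP, a, b, LT". */
static int setcond_lt(int a, int b)
{
    return a < b;
}

int main(void)
{
    /* brcond a, b, LT, label  ==>  setcond TMP, a, b, LT; brcond TMP. */
    int tmp = setcond_lt(3, 7);
    assert((tmp != 0) == (3 < 7));

    /* rrrrr bit budget: 8 opcode bits + 5 * 4 register bits leave only
     * 4 spare bits, too few for a memop immediate, hence movi to TMP. */
    assert(32 - (8 + 5 * 4) == 4);
    return 0;
}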
@@ -794,6 +800,11 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
    return ct & TCG_CT_CONST;
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0, sizeof(*p) * count);
}
static void tcg_target_init(TCGContext *s)
{
#if defined(CONFIG_DEBUG_TCG_INTERPRETER)
@@ -810,17 +821,22 @@ static void tcg_target_init(TCGContext *s)
    tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1;
    /* Registers available for 64 bit operations. */
    tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1;
    /*
     * The interpreter "registers" are in the local stack frame and
     * cannot be clobbered by the called helper functions.  However,
     * the interpreter assumes a 64-bit return value and assigns to
     * the return value registers.
     */
    tcg_target_call_clobber_regs =
        MAKE_64BIT_MASK(TCG_REG_R0, 64 / TCG_TARGET_REG_BITS);

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);

    /* The call arguments come first, followed by the temp storage. */
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  TCG_STATIC_FRAME_SIZE);
}
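The new clobber set covers only the registers a helper's return value can land in: one register on a 64-bit host, the R0:R1 pair on a 32-bit host. A standalone check of the mask arithmetic, with a local copy of the MAKE_64BIT_MASK formula:

#include <assert.h>
#include <stdint.h>

/* Local copy of the MAKE_64BIT_MASK() formula, for checking only. */
#define MAKE_64BIT_MASK_DEMO(shift, length) \
    (((~0ULL) >> (64 - (length))) << (shift))

int main(void)
{
    /* 64-bit host: 64 / 64 = one return register (R0). */
    assert(MAKE_64BIT_MASK_DEMO(0, 64 / 64) == 0x1);
    /* 32-bit host: 64 / 32 = two registers (R0:R1) for a 64-bit value. */
    assert(MAKE_64BIT_MASK_DEMO(0, 64 / 32) == 0x3);
    return 0;
}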
/* Generate global QEMU prologue and epilogue code. */

@@ -41,7 +41,7 @@
#define TCG_TARGET_H
#define TCG_TARGET_INTERPRETER 1
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
@@ -68,26 +68,26 @@
#define TCG_TARGET_HAS_ext16s_i32 1
#define TCG_TARGET_HAS_ext8u_i32 1
#define TCG_TARGET_HAS_ext16u_i32 1
#define TCG_TARGET_HAS_andc_i32 1
#define TCG_TARGET_HAS_deposit_i32 1
#define TCG_TARGET_HAS_extract_i32 1
#define TCG_TARGET_HAS_sextract_i32 1
#define TCG_TARGET_HAS_extract2_i32 0
#define TCG_TARGET_HAS_eqv_i32 1
#define TCG_TARGET_HAS_nand_i32 1
#define TCG_TARGET_HAS_nor_i32 1
#define TCG_TARGET_HAS_clz_i32 1
#define TCG_TARGET_HAS_ctz_i32 1
#define TCG_TARGET_HAS_ctpop_i32 1
#define TCG_TARGET_HAS_neg_i32 1
#define TCG_TARGET_HAS_not_i32 1
#define TCG_TARGET_HAS_orc_i32 1
#define TCG_TARGET_HAS_rot_i32 1
#define TCG_TARGET_HAS_movcond_i32 1
#define TCG_TARGET_HAS_muls2_i32 1
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
#define TCG_TARGET_HAS_goto_ptr 1
#define TCG_TARGET_HAS_direct_jump 0
#define TCG_TARGET_HAS_qemu_st8_i32 0
@@ -98,8 +98,8 @@
#define TCG_TARGET_HAS_bswap32_i64 1
#define TCG_TARGET_HAS_bswap64_i64 1
#define TCG_TARGET_HAS_deposit_i64 1
#define TCG_TARGET_HAS_extract_i64 1
#define TCG_TARGET_HAS_sextract_i64 1
#define TCG_TARGET_HAS_extract2_i64 0
#define TCG_TARGET_HAS_div_i64 1
#define TCG_TARGET_HAS_rem_i64 1
@@ -109,25 +109,25 @@
#define TCG_TARGET_HAS_ext8u_i64 1
#define TCG_TARGET_HAS_ext16u_i64 1
#define TCG_TARGET_HAS_ext32u_i64 1
#define TCG_TARGET_HAS_andc_i64 1
#define TCG_TARGET_HAS_eqv_i64 1
#define TCG_TARGET_HAS_nand_i64 1
#define TCG_TARGET_HAS_nor_i64 1
#define TCG_TARGET_HAS_clz_i64 1
#define TCG_TARGET_HAS_ctz_i64 1
#define TCG_TARGET_HAS_ctpop_i64 1
#define TCG_TARGET_HAS_neg_i64 1
#define TCG_TARGET_HAS_not_i64 1
#define TCG_TARGET_HAS_orc_i64 1
#define TCG_TARGET_HAS_rot_i64 1
#define TCG_TARGET_HAS_movcond_i64 1
#define TCG_TARGET_HAS_muls2_i64 1
#define TCG_TARGET_HAS_add2_i32 1
#define TCG_TARGET_HAS_sub2_i32 1
#define TCG_TARGET_HAS_mulu2_i32 1
#define TCG_TARGET_HAS_add2_i64 1
#define TCG_TARGET_HAS_sub2_i64 1
#define TCG_TARGET_HAS_mulu2_i64 1
#define TCG_TARGET_HAS_muluh_i64 0
#define TCG_TARGET_HAS_mulsh_i64 0
#else
@@ -156,15 +156,17 @@ typedef enum {
    TCG_REG_R14,
    TCG_REG_R15,

    TCG_REG_TMP = TCG_REG_R13,
    TCG_AREG0 = TCG_REG_R14,
    TCG_REG_CALL_STACK = TCG_REG_R15,
} TCGReg;
/* Used for function call generation. */
#define TCG_TARGET_CALL_STACK_OFFSET 0
#define TCG_TARGET_STACK_ALIGN 8
#define HAVE_TCG_QEMU_TB_EXEC
#define TCG_TARGET_NEED_POOL_LABELS
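TCG_TARGET_NEED_POOL_LABELS switches on the generic constant-pool machinery that tci_movl and the call emitter above rely on. As a toy model of the idea only (every name below is invented, none of this is QEMU's API): record each patch site while emitting, append the pooled values after the code, then fix up the sites.

#include <assert.h>
#include <stdint.h>

typedef struct { int site; uint64_t value; } PoolEntry;   /* invented */

int main(void)
{
    uint32_t code[16] = { 0 };
    uint64_t pool[8];
    PoolEntry pending[8];
    int ncode = 0, npool = 0;

    /* Emitting a "movl": leave a placeholder word, remember the site. */
    pending[npool++] = (PoolEntry){ ncode, UINT64_C(0x123456789abcdef) };
    code[ncode++] = 0;

    /* End of block: place values after the code, then patch each site
     * with the pool slot its instruction should load from. */
    for (int i = 0; i < npool; i++) {
        pool[i] = pending[i].value;
        code[pending[i].site] = i;
    }
    assert(pool[code[0]] == UINT64_C(0x123456789abcdef));
    return 0;
}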
/* We could notice __i386__ or __s390x__ and reduce the barriers depending
on the host. But if you want performance, you use the normal backend.

@@ -22,6 +22,7 @@ ENV PACKAGES \
libaio-dev \
libbpf-dev \
libcap-ng-dev \
libffi-dev \
libjpeg-turbo-dev \
libnfs-dev \
libpng-dev \

@@ -17,6 +17,7 @@ ENV PACKAGES \
libbpf-devel \
libepoxy-devel \
libfdt-devel \
libffi-devel \
libgcrypt-devel \
lzo-devel \
make \

@@ -26,6 +26,7 @@ RUN apt update && \
gdb-multiarch \
gettext \
git \
libffi-dev \
libncurses5-dev \
ninja-build \
pkg-config \

@@ -6,6 +6,7 @@ ENV PACKAGES \
findutils \
gcc \
git \
libffi-devel.i686 \
libtasn1-devel.i686 \
libzstd-devel.i686 \
make \

@@ -19,6 +19,7 @@ ENV PACKAGES \
mingw32-gmp \
mingw32-gnutls \
mingw32-gtk3 \
mingw32-libffi \
mingw32-libjpeg-turbo \
mingw32-libpng \
mingw32-libtasn1 \

@@ -18,6 +18,7 @@ ENV PACKAGES \
mingw64-glib2 \
mingw64-gmp \
mingw64-gtk3 \
mingw64-libffi \
mingw64-libjpeg-turbo \
mingw64-libpng \
mingw64-libtasn1 \

@@ -33,6 +33,7 @@ ENV PACKAGES \
libepoxy-devel \
libfdt-devel \
libbpf-devel \
libffi-devel \
libiscsi-devel \
libjpeg-devel \
libpmem-devel \

@@ -28,6 +28,7 @@ ENV PACKAGES \
libdrm-dev \
libepoxy-dev \
libfdt-dev \
libffi-dev \
libgbm-dev \
libgnutls28-dev \
libgtk-3-dev \

@@ -16,6 +16,7 @@ ENV PACKAGES \
libdrm-dev \
libepoxy-dev \
libfdt-dev \
libffi-dev \
libgbm-dev \
libgtk-3-dev \
libibverbs-dev \

@@ -19,6 +19,7 @@ ENV PACKAGES flex bison \
libdrm-dev \
libepoxy-dev \
libfdt-dev \
libffi-dev \
libgbm-dev \
libgtk-3-dev \
libibverbs-dev \

@@ -81,8 +81,10 @@ LDFLAGS=
QEMU_OPTS=
# If TCG debugging or TCI is enabled, things are a lot slower
ifneq ($(CONFIG_TCG_INTERPRETER),)
TIMEOUT=90
else ifneq ($(CONFIG_DEBUG_TCG),)
TIMEOUT=60
else
TIMEOUT=15

@@ -58,7 +58,11 @@ void *qemu_try_memalign(size_t alignment, size_t size)
    void *ptr;

    g_assert(size != 0);
    if (alignment < sizeof(void *)) {
        alignment = sizeof(void *);
    } else {
        g_assert(is_power_of_2(alignment));
    }
    ptr = _aligned_malloc(size, alignment);
    trace_qemu_memalign(alignment, size, ptr);
    return ptr;
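The fix rounds any alignment below sizeof(void *) up to that minimum, so only larger values still have to satisfy the power-of-two assertion that previously fired. A standalone sketch of the clamp:

#include <assert.h>
#include <stddef.h>

static size_t clamp_alignment(size_t alignment)
{
    return alignment < sizeof(void *) ? sizeof(void *) : alignment;
}

int main(void)
{
    assert(clamp_alignment(0) == sizeof(void *));
    assert(clamp_alignment(1) == sizeof(void *));
    assert(clamp_alignment(64) == 64);
    return 0;
}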