tcg: Add generic vector expanders

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

parent 474b2e8f0f
commit db432672dc
@@ -93,7 +93,7 @@ all: $(PROGS) stap
 # cpu emulator library
 obj-y += exec.o
 obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
 obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
@@ -1,6 +1,6 @@
 obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
 obj-y += translator.o
@@ -0,0 +1,325 @@
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)  { X, X, X, X, X, X, X, X }
#define DUP4(X)  { X, X, X, X }
#define DUP2(X)  { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X) X
#define DUP8(X)  X
#define DUP4(X)  X
#define DUP2(X)  X
#endif /* CONFIG_VECTOR16 */

static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
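The pattern above is small enough to exercise outside QEMU. What follows is a self-contained sketch (mine, not part of the commit): it re-creates the vec8 type, hand-packs a descriptor using the two 5-bit size fields from tcg-gvec-desc.h (added later in this commit), and shows clear_high() zeroing the bytes between oprsz and maxsz.

    /* Standalone sketch of the gvec helper pattern; descriptor packing is
     * hand-rolled here for illustration. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint8_t vec8 __attribute__((vector_size(16)));

    /* Decoders mirroring tcg-gvec-desc.h: each 5-bit field stores size/8 - 1. */
    static inline intptr_t simd_oprsz(uint32_t desc) { return ((desc & 0x1f) + 1) * 8; }
    static inline intptr_t simd_maxsz(uint32_t desc) { return (((desc >> 5) & 0x1f) + 1) * 8; }

    static void clear_high(char *d, intptr_t oprsz, uint32_t desc)
    {
        for (intptr_t i = oprsz; i < simd_maxsz(desc); i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }

    static void gvec_add8(char *d, char *a, char *b, uint32_t desc)
    {
        intptr_t oprsz = simd_oprsz(desc);
        for (intptr_t i = 0; i < oprsz; i += sizeof(vec8)) {
            *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
        }
        clear_high(d, oprsz, desc);
    }

    int main(void)
    {
        /* Operate on the low 16 bytes of a 32-byte register:
           oprsz = 16, maxsz = 32.  Alignment must be 16 for vec8 stores. */
        uint32_t desc = (16 / 8 - 1) | ((32 / 8 - 1) << 5);
        static char a[32] __attribute__((aligned(16)));
        static char b[32] __attribute__((aligned(16)));
        static char d[32] __attribute__((aligned(16)));

        memset(a, 1, sizeof(a));
        memset(b, 2, sizeof(b));
        memset(d, 0xff, sizeof(d));
        gvec_add8(d, a, b, desc);
        printf("d[0]=%d d[16]=%d\n", d[0], d[16]);  /* d[0]=3, d[16]=0 */
        return 0;
    }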
@@ -134,3 +134,32 @@ GEN_ATOMIC_HELPERS(xor_fetch)
 GEN_ATOMIC_HELPERS(xchg)
 
 #undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
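These DEF_HELPER_FLAGS_N stanzas are what tie the HELPER(gvec_*) definitions in tcg-runtime-gvec.c to callable prototypes. As a rough sketch (the exact expansion lives in QEMU's helper-generation headers and is not part of this diff), the first line above amounts to the declaration below; TCG_CALL_NO_RWG is a flag telling the optimizer the helper neither reads nor writes TCG globals.

    /* Sketch of the expansion: HELPER(name) pastes the helper_ prefix,
     * so the definition in tcg-runtime-gvec.c provides this symbol. */
    void helper_gvec_mov(void *d, void *a, uint32_t desc);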
@@ -5000,6 +5000,50 @@ if compile_prog "" "" ; then
   atomic64=yes
 fi
 
+########################################
+# See if 16-byte vector operations are supported.
+# Even without a vector unit the compiler may expand these.
+# There is a bug in old GCC for PPC that crashes here.
+# Unfortunately it's the system compiler for Centos 7.
+
+cat > $TMPC << EOF
+typedef unsigned char U1 __attribute__((vector_size(16)));
+typedef unsigned short U2 __attribute__((vector_size(16)));
+typedef unsigned int U4 __attribute__((vector_size(16)));
+typedef unsigned long long U8 __attribute__((vector_size(16)));
+typedef signed char S1 __attribute__((vector_size(16)));
+typedef signed short S2 __attribute__((vector_size(16)));
+typedef signed int S4 __attribute__((vector_size(16)));
+typedef signed long long S8 __attribute__((vector_size(16)));
+static U1 a1, b1;
+static U2 a2, b2;
+static U4 a4, b4;
+static U8 a8, b8;
+static S1 c1;
+static S2 c2;
+static S4 c4;
+static S8 c8;
+static int i;
+int main(void)
+{
+  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
+  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
+  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
+  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
+  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
+  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
+  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
+  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
+  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
+  return 0;
+}
+EOF
+
+vector16=no
+if compile_prog "" "" ; then
+  vector16=yes
+fi
+
 ########################################
 # check if getauxval is available.
 

@@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then
   echo "CONFIG_ATOMIC64=y" >> $config_host_mak
 fi
 
+if test "$vector16" = "yes" ; then
+  echo "CONFIG_VECTOR16=y" >> $config_host_mak
+fi
+
 if test "$getauxval" = "yes" ; then
   echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
 fi
@@ -0,0 +1,49 @@
/*
 * Generic vector operation descriptor
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
#define SIMD_OPRSZ_SHIFT   0
#define SIMD_OPRSZ_BITS    5

#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
#define SIMD_MAXSZ_BITS    5

#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
#define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);

/* Extract the operation size from a descriptor.  */
static inline intptr_t simd_oprsz(uint32_t desc)
{
    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
}

/* Extract the max vector size from a descriptor.  */
static inline intptr_t simd_maxsz(uint32_t desc)
{
    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
}

/* Extract the operation-specific data from a descriptor.  */
static inline int32_t simd_data(uint32_t desc)
{
    return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
}

File diff suppressed because it is too large.
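The suppressed file is the new tcg-op-gvec.c (referenced by the Makefile change and the runtime comment above), which also implements the matching encoder, simd_desc(). A sketch consistent with the three decoders, using deposit32()/sextract32() from QEMU's bitops; the exact assertions here are an assumption, not the verbatim committed code:

    /* Sketch: pack each size as size/8 - 1, mirroring the "+ 1) * 8"
     * in the decoders, and the signed data field as-is. */
    uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
    {
        uint32_t desc = 0;

        assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));  /* <= 256 */
        assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
        assert(data == sextract32(data, 0, SIMD_DATA_BITS));

        desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz / 8 - 1);
        desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz / 8 - 1);
        desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
        return desc;
    }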
@@ -0,0 +1,198 @@
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * "Generic" vectors.  All operands are given as offsets from ENV,
 * and therefore cannot also be allocated via tcg_global_mem_new_*.
 * OPRSZ is the byte size of the vector upon which the operation is performed.
 * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
 *
 * All sizes must be 8 or any multiple of 16.
 * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
 * Operands may completely, but not partially, overlap.
 */

/* Expand a call to a gvec-style helper, with pointers to two vector
   operands, and a descriptor (see tcg-gvec-desc.h).  */
typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn);

/* Similarly, passing an extra pointer (e.g. env or float_status).  */
typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn);

/* Similarly, with three vector operands.  */
typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn);

/* Similarly, with four vector operands.  */
typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn);

/* Similarly, with five vector operands.  */
typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);

typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn);

typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn);

/* Expand a gvec operation.  Either inline or out-of-line depending on
   the actual vector size and the operations supported by the host.  */
typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_2 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
} GVecGen2;

typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_3 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
    /* Load dest as a 3rd source operand.  */
    bool load_dest;
} GVecGen3;

typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_4 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
} GVecGen4;

void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);

/* Expand a specific vector operation.  */

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t s, uint32_t m);
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i32);
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i64);

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);

/*
 * 64-bit vector operations.  Use these when the register has been allocated
 * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
 * OPRSZ = MAXSZ = 8.
 */

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
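To make the GVecGen structs' role concrete, here is a hypothetical use (not from this commit; the real tables live in the suppressed tcg-op-gvec.c): each field names one expansion strategy, and tcg_gen_gvec_3() picks among them based on what the host supports, falling back to the out-of-line helper when nothing better is available.

    /* Hypothetical example: describe an 8-bit element-wise add, reusing
     * the integer fallback, vector opcode, and out-of-line helper that
     * this commit introduces elsewhere. */
    static const GVecGen3 g_add8 = {
        .fni8 = tcg_gen_vec_add8_i64,   /* fallback on 64-bit integers */
        .fniv = tcg_gen_add_vec,        /* inline host vector expansion */
        .fno = gen_helper_gvec_add8,    /* out-of-line helper w/descriptor */
        .opc = INDEX_op_add_vec,
        .vece = MO_8,
    };

    static void gen_vadd8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, uint32_t maxsz)
    {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g_add8);
    }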
@@ -73,7 +73,8 @@ static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
     TCGTemp *at = tcgv_vec_temp(a);
     TCGType type = rt->base_type;
 
-    tcg_debug_assert(at->base_type == type);
+    /* Must have enough inputs for the output.  */
+    tcg_debug_assert(at->base_type >= type);
     vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
 }
 

@@ -85,8 +86,9 @@ static void vec_gen_op3(TCGOpcode opc, unsigned vece,
     TCGTemp *bt = tcgv_vec_temp(b);
     TCGType type = rt->base_type;
 
-    tcg_debug_assert(at->base_type == type);
-    tcg_debug_assert(bt->base_type == type);
+    /* Must have enough inputs for the output.  */
+    tcg_debug_assert(at->base_type >= type);
+    tcg_debug_assert(bt->base_type >= type);
     vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
 }
 

@@ -99,7 +101,7 @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
 
 #define MO_REG (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
 
-static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 {
     TCGTemp *rt = tcgv_vec_temp(r);
     vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);

@@ -108,14 +110,14 @@ static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 TCGv_vec tcg_const_zeros_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, 0);
+    do_dupi_vec(ret, MO_REG, 0);
     return ret;
 }
 
 TCGv_vec tcg_const_ones_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, -1);
+    do_dupi_vec(ret, MO_REG, -1);
     return ret;
 }
 

@@ -134,9 +136,9 @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
     if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
-        tcg_gen_dupi_vec(r, MO_32, a);
+        do_dupi_vec(r, MO_32, a);
     } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
-        tcg_gen_dupi_vec(r, MO_64, a);
+        do_dupi_vec(r, MO_64, a);
     } else {
         TCGv_i64 c = tcg_const_i64(a);
         tcg_gen_dup_i64_vec(MO_64, r, c);

@@ -146,17 +148,22 @@ void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 
 void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffffffffu) * a);
+    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
 }
 
 void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffff) * (a & 0xffff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
 }
 
 void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xff) * (a & 0xff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
 }
 
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
+{
+    do_dupi_vec(r, MO_REG, dup_const(vece, a));
+}
+
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)

@@ -167,14 +174,14 @@ void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
 
     if (TCG_TARGET_REG_BITS == 64) {
         TCGArg ai = tcgv_i64_arg(a);
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
     } else if (vece == MO_64) {
         TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
         TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
         vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
     } else {
         TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
     }
 }
 
@@ -914,6 +914,7 @@ void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
 void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
@@ -228,6 +228,12 @@ DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
 DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
 DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
 
+DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
+
+#if TCG_TARGET_MAYBE_vec
+#include "tcg-target.opc.h"
+#endif
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
tcg/tcg.c

@@ -1403,10 +1403,10 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_orc_vec:
         return have_vec && TCG_TARGET_HAS_orc_vec;
 
-    case NB_OPS:
-        break;
+    default:
+        tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
+        return true;
     }
     g_assert_not_reached();
 }
 
 /* Note: we convert the 64 bit args to 32 bit and do some alignment

@@ -3733,3 +3733,10 @@ void tcg_register_jit(void *buf, size_t buf_size)
 {
 }
 #endif /* ELF_HOST_MACHINE */
+
+#if !TCG_TARGET_MAYBE_vec
+void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...)
+{
+    g_assert_not_reached();
+}
+#endif
tcg/tcg.h

@@ -1207,6 +1207,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
 
 void tcg_register_jit(void *buf, size_t buf_size);
 
+#if TCG_TARGET_MAYBE_vec
+/* Return zero if the tuple (opc, type, vece) is unsupportable;
+   return > 0 if it is directly supportable;
+   return < 0 if we must call tcg_expand_vec_op.  */
+int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
+#else
+static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
+{
+    return 0;
+}
+#endif
+
+/* Expand the tuple (opc, type, vece) on the given arguments.  */
+void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
+
+/* Replicate a constant C according to the log2 of the element size.  */
+uint64_t dup_const(unsigned vece, uint64_t c);
+
+#define dup_const(VECE, C)                                         \
+    (__builtin_constant_p(VECE)                                    \
+     ? (  (VECE) == MO_8  ? 0x0101010101010101ull * (uint8_t)(C)   \
+        : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C)  \
+        : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C)  \
+        : dup_const(VECE, C))                                      \
+     : dup_const(VECE, C))
+
+
 /*
  * Memory helpers that will be used by TCG generated code.
  */
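The constant-folding arm of the dup_const macro above is easy to sanity-check (illustration, not part of the commit):

    dup_const(MO_8,  0xab);        /* 0xababababababababull */
    dup_const(MO_16, 0x1234);      /* 0x1234123412341234ull */
    dup_const(MO_32, 0xdeadbeef);  /* 0xdeadbeefdeadbeefull */

For a non-constant vece, the macro falls back on the out-of-line dup_const() function declared just before it; the name pun between the macro and the function is deliberate, so callers write the same expression either way.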