Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20190613-1' into staging

target-arm queue:
 * convert aarch32 VFP decoder to decodetree
   (includes tightening up decode in a few places)
 * fix minor bugs in VFP short-vector handling
 * hw/core/bus.c: Only the main system bus can have no parent
 * smmuv3: Fix decoding of ID register range
 * Implement NSACR gating of floating point
 * Use tcg_gen_gvec_bitsel

# gpg: Signature made Thu 13 Jun 2019 15:15:39 BST
# gpg:                using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg:                issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83  15CF 3C25 25ED 1436 0CDE

* remotes/pmaydell/tags/pull-target-arm-20190613-1: (47 commits)
  target/arm: Fix short-vector increment behaviour
  target/arm: Convert float-to-integer VCVT insns to decodetree
  target/arm: Convert VCVT fp/fixed-point conversion insns to decodetree
  target/arm: Convert VJCVT to decodetree
  target/arm: Convert integer-to-float insns to decodetree
  target/arm: Convert double-single precision conversion insns to decodetree
  target/arm: Convert VFP round insns to decodetree
  target/arm: Convert the VCVT-to-f16 insns to decodetree
  target/arm: Convert the VCVT-from-f16 insns to decodetree
  target/arm: Convert VFP comparison insns to decodetree
  target/arm: Convert VMOV (register) to decodetree
  target/arm: Convert VSQRT to decodetree
  target/arm: Convert VNEG to decodetree
  target/arm: Convert VABS to decodetree
  target/arm: Convert VMOV (imm) to decodetree
  target/arm: Convert VFP fused multiply-add insns to decodetree
  target/arm: Convert VDIV to decodetree
  target/arm: Convert VSUB to decodetree
  target/arm: Convert VADD to decodetree
  target/arm: Convert VNMUL to decodetree
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Commit 650a379d50 by Peter Maydell, 2019-06-13 15:16:39 +01:00
17 changed files with 3203 additions and 1572 deletions


@@ -1232,7 +1232,7 @@ static MemTxResult smmu_readl(SMMUv3State *s, hwaddr offset,
uint64_t *data, MemTxAttrs attrs)
{
switch (offset) {
case A_IDREGS ... A_IDREGS + 0x1f:
case A_IDREGS ... A_IDREGS + 0x2f:
*data = smmuv3_idreg(offset - A_IDREGS);
return MEMTX_OK;
case A_IDR0 ... A_IDR5:
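
The widened case range is a size fix: the ID register block is a run of 32-bit registers, and byte offsets 0x00 through 0x2f cover 0x30 / 4 = 12 of them, where the old bound of 0x1f reached only 8. A minimal sketch of the kind of lookup this range feeds, assuming the ID values sit in a flat word table (the helper name and table are illustrative, not the actual smmuv3 code):

#include <assert.h>
#include <stdint.h>

/* Illustrative only: read one 32-bit ID register out of a flat table,
 * indexed by the byte offset into the ID register block. Twelve registers
 * span byte offsets 0x00..0x2f, which is why the case range must end at
 * A_IDREGS + 0x2f rather than + 0x1f. */
static uint32_t idreg_read(const uint32_t idregs[12], unsigned byte_offset)
{
    assert(byte_offset <= 0x2f);      /* 12 registers * 4 bytes - 1 */
    return idregs[byte_offset / 4];   /* word-aligned 32-bit read */
}

int main(void)
{
    uint32_t ids[12] = { 0 };
    assert(idreg_read(ids, 0x2c) == 0);   /* the 12th and last register */
    return 0;
}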


@@ -97,10 +97,9 @@ static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
bus->parent->num_child_bus++;
object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
object_unref(OBJECT(bus));
} else if (bus != sysbus_get_default()) {
/* TODO: once all bus devices are qdevified,
only reset handler for main_system_bus should be registered here. */
qemu_register_reset(qbus_reset_all_fn, bus);
} else {
/* The only bus without a parent is the main system bus */
assert(bus == sysbus_get_default());
}
}
@@ -109,18 +108,16 @@
BusState *bus = BUS(obj);
BusChild *kid;
/* Only the main system bus has no parent, and that bus is never freed */
assert(bus->parent);
while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
DeviceState *dev = kid->child;
object_unparent(OBJECT(dev));
}
if (bus->parent) {
QLIST_REMOVE(bus, sibling);
bus->parent->num_child_bus--;
bus->parent = NULL;
} else {
assert(bus != sysbus_get_default()); /* main_system_bus is never freed */
qemu_unregister_reset(qbus_reset_all_fn, bus);
}
QLIST_REMOVE(bus, sibling);
bus->parent->num_child_bus--;
bus->parent = NULL;
}
void qbus_create_inplace(void *bus, size_t size, const char *typename,


@@ -184,7 +184,7 @@ class Field:
return '{0}(insn, {1}, {2})'.format(extr, self.pos, self.len)
def __eq__(self, other):
return self.sign == other.sign and self.sign == other.sign
return self.sign == other.sign and self.mask == other.mask
def __ne__(self, other):
return not self.__eq__(other)


@@ -19,5 +19,18 @@ target/arm/decode-sve.inc.c: $(SRC_PATH)/target/arm/sve.decode $(DECODETREE)
$(PYTHON) $(DECODETREE) --decode disas_sve -o $@ $<,\
"GEN", $(TARGET_DIR)$@)
target/arm/decode-vfp.inc.c: $(SRC_PATH)/target/arm/vfp.decode $(DECODETREE)
$(call quiet-command,\
$(PYTHON) $(DECODETREE) --static-decode disas_vfp -o $@ $<,\
"GEN", $(TARGET_DIR)$@)
target/arm/decode-vfp-uncond.inc.c: $(SRC_PATH)/target/arm/vfp-uncond.decode $(DECODETREE)
$(call quiet-command,\
$(PYTHON) $(DECODETREE) --static-decode disas_vfp_uncond -o $@ $<,\
"GEN", $(TARGET_DIR)$@)
target/arm/translate-sve.o: target/arm/decode-sve.inc.c
target/arm/translate.o: target/arm/decode-vfp.inc.c
target/arm/translate.o: target/arm/decode-vfp-uncond.inc.c
obj-$(TARGET_AARCH64) += translate-sve.o sve_helper.o
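
These rules run decodetree with --static-decode, which, as with the existing SVE rule, emits a static decode function with the given name into the generated .inc.c; the translator that includes that file supplies one trans_* handler per pattern. A small self-contained sketch of that calling shape, with every name invented here rather than taken from the generated files:

#include <stdbool.h>
#include <stdint.h>

typedef struct DisasContext DisasContext;   /* stand-in for translator state */

/* decodetree emits one argument struct per pattern format ... */
typedef struct { int vd, vn, vm; } arg_EXAMPLE;

/* ... and calls a translator-supplied trans_* handler for each match. */
static bool trans_EXAMPLE(DisasContext *s, arg_EXAMPLE *a)
{
    (void)s;
    (void)a;
    return true;          /* true: instruction accepted, code emitted */
}

/* Stand-in for the generated static decoder: match the fixed bits,
 * extract the fields, hand them to the handler. */
static bool disas_example(DisasContext *s, uint32_t insn)
{
    arg_EXAMPLE a = {
        .vd = (insn >> 12) & 0xf,
        .vn = (insn >> 16) & 0xf,
        .vm = insn & 0xf,
    };
    return trans_EXAMPLE(s, &a);
}

int main(void)
{
    return disas_example(0, 0u) ? 0 : 1;
}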


@@ -1609,6 +1609,8 @@ static void cortex_r5f_initfn(Object *obj)
cortex_r5_initfn(obj);
set_feature(&cpu->env, ARM_FEATURE_VFP3);
cpu->isar.mvfr0 = 0x10110221;
cpu->isar.mvfr1 = 0x00000011;
}
static const ARMCPRegInfo cortexa8_cp_reginfo[] = {
@@ -2021,6 +2023,10 @@ static void arm_max_initfn(Object *obj)
kvm_arm_set_cpu_features_from_host(cpu);
} else {
cortex_a15_initfn(obj);
/* old-style VFP short-vector support */
cpu->isar.mvfr0 = FIELD_DP32(cpu->isar.mvfr0, MVFR0, FPSHVEC, 1);
#ifdef CONFIG_USER_ONLY
/* We don't set these in system emulation mode for the moment,
* since we don't correctly set (all of) the ID registers to


@@ -3371,6 +3371,17 @@ static inline bool isar_feature_aa32_fp16_arith(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, FP) == 1;
}
static inline bool isar_feature_aa32_fp_d32(const ARMISARegisters *id)
{
/* Return true if D16-D31 are implemented */
return FIELD_EX64(id->mvfr0, MVFR0, SIMDREG) >= 2;
}
static inline bool isar_feature_aa32_fpshvec(const ARMISARegisters *id)
{
return FIELD_EX64(id->mvfr0, MVFR0, FPSHVEC) > 0;
}
/*
* We always set the FP and SIMD FP16 fields to indicate identical
* levels of support (assuming SIMD is implemented at all), so
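
These two feature tests are what the converted VFP handlers use to reject encodings the CPU does not implement (the header comments in the new .decode files below call out exactly these checks). A standalone illustration of the D16-D31 case, assuming only that MVFR0.SIMDReg occupies bits [3:0] and that a value of 2 or more means the full 32-register bank; the function names are invented:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Invented helpers mirroring isar_feature_aa32_fp_d32: MVFR0.SIMDReg is
 * bits [3:0]; >= 2 means D0-D31 are all implemented. A double-precision
 * register number with bit 4 set is only legal in that case. */
static bool fp_d32_implemented(uint32_t mvfr0)
{
    return (mvfr0 & 0xf) >= 2;
}

static bool dp_reg_ok(uint32_t mvfr0, int dreg)
{
    return (dreg & 0x10) == 0 || fp_d32_implemented(mvfr0);
}

int main(void)
{
    /* The Cortex-R5F MVFR0 value set earlier in this series (0x10110221)
     * has SIMDReg == 1, so D0-D15 are fine but D16-D31 must UNDEF. */
    assert(dp_reg_ok(0x10110221, 7));
    assert(!dp_reg_ok(0x10110221, 17));
    return 0;
}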


@@ -930,9 +930,36 @@ static void cpacr_write(CPUARMState *env, const ARMCPRegInfo *ri,
}
value &= mask;
}
/*
* For A-profile AArch32 EL3 (but not M-profile secure mode), if NSACR.CP10
* is 0 then CPACR.{CP11,CP10} ignore writes and read as 0b00.
*/
if (arm_feature(env, ARM_FEATURE_EL3) && !arm_el_is_aa64(env, 3) &&
!arm_is_secure(env) && !extract32(env->cp15.nsacr, 10, 1)) {
value &= ~(0xf << 20);
value |= env->cp15.cpacr_el1 & (0xf << 20);
}
env->cp15.cpacr_el1 = value;
}
static uint64_t cpacr_read(CPUARMState *env, const ARMCPRegInfo *ri)
{
/*
* For A-profile AArch32 EL3 (but not M-profile secure mode), if NSACR.CP10
* is 0 then CPACR.{CP11,CP10} ignore writes and read as 0b00.
*/
uint64_t value = env->cp15.cpacr_el1;
if (arm_feature(env, ARM_FEATURE_EL3) && !arm_el_is_aa64(env, 3) &&
!arm_is_secure(env) && !extract32(env->cp15.nsacr, 10, 1)) {
value &= ~(0xf << 20);
}
return value;
}
static void cpacr_reset(CPUARMState *env, const ARMCPRegInfo *ri)
{
/* Call cpacr_write() so that we reset with the correct RAO bits set
@@ -998,7 +1025,7 @@ static const ARMCPRegInfo v6_cp_reginfo[] = {
{ .name = "CPACR", .state = ARM_CP_STATE_BOTH, .opc0 = 3,
.crn = 1, .crm = 0, .opc1 = 0, .opc2 = 2, .accessfn = cpacr_access,
.access = PL1_RW, .fieldoffset = offsetof(CPUARMState, cp15.cpacr_el1),
.resetfn = cpacr_reset, .writefn = cpacr_write },
.resetfn = cpacr_reset, .writefn = cpacr_write, .readfn = cpacr_read },
REGINFO_SENTINEL
};
@@ -4683,6 +4710,36 @@ uint64_t arm_hcr_el2_eff(CPUARMState *env)
return ret;
}
static void cptr_el2_write(CPUARMState *env, const ARMCPRegInfo *ri,
uint64_t value)
{
/*
* For A-profile AArch32 EL3, if NSACR.CP10
* is 0 then HCPTR.{TCP11,TCP10} ignore writes and read as 1.
*/
if (arm_feature(env, ARM_FEATURE_EL3) && !arm_el_is_aa64(env, 3) &&
!arm_is_secure(env) && !extract32(env->cp15.nsacr, 10, 1)) {
value &= ~(0x3 << 10);
value |= env->cp15.cptr_el[2] & (0x3 << 10);
}
env->cp15.cptr_el[2] = value;
}
static uint64_t cptr_el2_read(CPUARMState *env, const ARMCPRegInfo *ri)
{
/*
* For A-profile AArch32 EL3, if NSACR.CP10
* is 0 then HCPTR.{TCP11,TCP10} ignore writes and read as 1.
*/
uint64_t value = env->cp15.cptr_el[2];
if (arm_feature(env, ARM_FEATURE_EL3) && !arm_el_is_aa64(env, 3) &&
!arm_is_secure(env) && !extract32(env->cp15.nsacr, 10, 1)) {
value |= 0x3 << 10;
}
return value;
}
static const ARMCPRegInfo el2_cp_reginfo[] = {
{ .name = "HCR_EL2", .state = ARM_CP_STATE_AA64,
.type = ARM_CP_IO,
@@ -4730,7 +4787,8 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
{ .name = "CPTR_EL2", .state = ARM_CP_STATE_BOTH,
.opc0 = 3, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 2,
.access = PL2_RW, .accessfn = cptr_access, .resetvalue = 0,
.fieldoffset = offsetof(CPUARMState, cp15.cptr_el[2]) },
.fieldoffset = offsetof(CPUARMState, cp15.cptr_el[2]),
.readfn = cptr_el2_read, .writefn = cptr_el2_write },
{ .name = "MAIR_EL2", .state = ARM_CP_STATE_BOTH,
.opc0 = 3, .opc1 = 4, .crn = 10, .crm = 2, .opc2 = 0,
.access = PL2_RW, .fieldoffset = offsetof(CPUARMState, cp15.mair_el[2]),
@@ -13587,6 +13645,19 @@ int fp_exception_el(CPUARMState *env, int cur_el)
break;
}
/*
* The NSACR allows A-profile AArch32 EL3 and M-profile secure mode
* to control non-secure access to the FPU. It doesn't have any
* effect if EL3 is AArch64 or if EL3 doesn't exist at all.
*/
if ((arm_feature(env, ARM_FEATURE_EL3) && !arm_el_is_aa64(env, 3) &&
cur_el <= 2 && !arm_is_secure_below_el3(env))) {
if (!extract32(env->cp15.nsacr, 10, 1)) {
/* FP insns act as UNDEF */
return cur_el == 2 ? 2 : 1;
}
}
/* For the CPTR registers we don't need to guard with an ARM_FEATURE
* check because zero bits in the registers mean "don't trap".
*/
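
The same NSACR.CP10 test recurs in cpacr_write/read, cptr_el2_write/read and fp_exception_el above. A standalone restatement of the fp_exception_el gate with the CPU state flattened into plain parameters; the function below is illustrative, not a QEMU helper:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* With an AArch32 EL3 present and the access coming from a non-secure
 * EL <= 2, a clear NSACR.CP10 makes FP instructions UNDEF, trapping to
 * EL1, or to EL2 if that is where they were executed. Returns the target
 * EL, or 0 for "no trap from this check". */
static int nsacr_fp_trap_el(bool have_el3, bool el3_is_aa64,
                            bool secure, int cur_el, uint32_t nsacr)
{
    if (have_el3 && !el3_is_aa64 && cur_el <= 2 && !secure &&
        !((nsacr >> 10) & 1)) {
        return cur_el == 2 ? 2 : 1;
    }
    return 0;
}

int main(void)
{
    /* Non-secure EL0/EL1 with NSACR.CP10 == 0: FP UNDEFs to EL1. */
    assert(nsacr_fp_trap_el(true, false, false, 0, 0) == 1);
    /* Non-secure EL2: the trap is taken to EL2 instead. */
    assert(nsacr_fp_trap_el(true, false, false, 2, 0) == 2);
    /* Secure state, or NSACR.CP10 == 1: no effect. */
    assert(nsacr_fp_trap_el(true, false, true, 1, 0) == 0);
    assert(nsacr_fp_trap_el(true, false, false, 1, 1u << 10) == 0);
    return 0;
}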


@@ -344,9 +344,9 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) {
int error_code = (keynumber << 1) | (keynumber ^ 1);
if (param.tbi) {
return deposit64(ptr, 53, 2, error_code);
return deposit64(orig_ptr, 53, 2, error_code);
} else {
return deposit64(ptr, 61, 2, error_code);
return deposit64(orig_ptr, 61, 2, error_code);
}
}
return orig_ptr;


@@ -704,6 +704,15 @@ static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm,
vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s));
}
/* Expand a 4-operand AdvSIMD vector operation using an expander function. */
static void gen_gvec_fn4(DisasContext *s, bool is_q, int rd, int rn, int rm,
int rx, GVecGen4Fn *gvec_fn, int vece)
{
gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
vec_full_reg_offset(s, rm), vec_full_reg_offset(s, rx),
is_q ? 16 : 8, vec_full_reg_size(s));
}
/* Expand a 2-operand + immediate AdvSIMD vector operation using
* an op descriptor.
*/
@@ -10918,13 +10927,13 @@ static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
return;
case 5: /* BSL bitwise select */
gen_gvec_op3(s, is_q, rd, rn, rm, &bsl_op);
gen_gvec_fn4(s, is_q, rd, rd, rn, rm, tcg_gen_gvec_bitsel, 0);
return;
case 6: /* BIT, bitwise insert if true */
gen_gvec_op3(s, is_q, rd, rn, rm, &bit_op);
gen_gvec_fn4(s, is_q, rd, rm, rn, rd, tcg_gen_gvec_bitsel, 0);
return;
case 7: /* BIF, bitwise insert if false */
gen_gvec_op3(s, is_q, rd, rn, rm, &bif_op);
gen_gvec_fn4(s, is_q, rd, rm, rd, rn, tcg_gen_gvec_bitsel, 0);
return;
default:
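
All three Neon logic ops now reduce to one primitive: tcg_gen_gvec_bitsel(vece, d, a, b, c, ...) selects, per bit, operand b where a is set and operand c where it is clear, so BSL uses rd itself as the mask while BIT and BIF use rm as the mask and swap which of rn/rd supplies the selected value. A scalar model of those equivalences (names invented here):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the per-element operation: take a bit from 'tv' where
 * 'mask' has a 1, from 'fv' where it has a 0. */
static uint64_t bitsel(uint64_t mask, uint64_t tv, uint64_t fv)
{
    return (tv & mask) | (fv & ~mask);
}

int main(void)
{
    uint64_t rd = 0x00ff00ff00ff00ffull;
    uint64_t rn = 0x0123456789abcdefull;
    uint64_t rm = 0xfedcba9876543210ull;

    /* BSL: rd = bitsel(rd, rn, rm) -> gen_gvec_fn4(.., rd, rd, rn, rm, ..) */
    /* BIT: rd = bitsel(rm, rn, rd) -> gen_gvec_fn4(.., rd, rm, rn, rd, ..) */
    /* BIF: rd = bitsel(rm, rd, rn) -> gen_gvec_fn4(.., rd, rm, rd, rn, ..) */
    assert(bitsel(~0ull, rn, rm) == rn);            /* all-ones mask: tv */
    assert(bitsel(0, rn, rm) == rm);                /* all-zeros mask: fv */
    assert(bitsel(rd, rn, rm) == 0xfe23ba6776ab32efull);
    return 0;
}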


@@ -122,5 +122,7 @@ typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t,
uint32_t, uint32_t);
typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
#endif /* TARGET_ARM_TRANSLATE_A64_H */

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -238,9 +238,6 @@ static inline void gen_ss_advance(DisasContext *s)
}
/* Vector operations shared between ARM and AArch64. */
extern const GVecGen3 bsl_op;
extern const GVecGen3 bit_op;
extern const GVecGen3 bif_op;
extern const GVecGen3 mla_op[4];
extern const GVecGen3 mls_op[4];
extern const GVecGen3 cmtst_op[4];


@@ -0,0 +1,63 @@
# AArch32 VFP instruction descriptions (unconditional insns)
#
# Copyright (c) 2019 Linaro, Ltd
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see <http://www.gnu.org/licenses/>.
#
# This file is processed by scripts/decodetree.py
#
# Encodings for the unconditional VFP instructions are here:
# generally anything matching A32
# 1111 1110 .... .... .... 101. ...0 ....
# and T32
# 1111 110. .... .... .... 101. .... ....
# 1111 1110 .... .... .... 101. .... ....
# (but those patterns might also cover some Neon instructions,
# which do not live in this file.)
# VFP registers have an odd encoding with a four-bit field
# and a one-bit field which are assembled in different orders
# depending on whether the register is double or single precision.
# Each individual instruction function must do the checks for
# "double register selected but CPU does not have double support"
# and "double register number has bit 4 set but CPU does not
# support D16-D31" (which should UNDEF).
%vm_dp 5:1 0:4
%vm_sp 0:4 5:1
%vn_dp 7:1 16:4
%vn_sp 16:4 7:1
%vd_dp 22:1 12:4
%vd_sp 12:4 22:1
VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp dp=0
VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp dp=1
VMINMAXNM 1111 1110 1.00 .... .... 1010 . op:1 .0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp dp=0
VMINMAXNM 1111 1110 1.00 .... .... 1011 . op:1 .0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp dp=1
VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \
vm=%vm_sp vd=%vd_sp dp=0
VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \
vm=%vm_dp vd=%vd_dp dp=1
# VCVT float to int with specified rounding mode; Vd is always single-precision
VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \
vm=%vm_sp vd=%vd_sp dp=0
VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \
vm=%vm_dp vd=%vd_sp dp=1
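
The %vm_dp / %vm_sp definitions above (and the vn/vd equivalents) implement the register-number split the header comment describes: decodetree concatenates the listed sub-fields most-significant first, so the one-bit field is the top bit of a double-precision register number but the bottom bit of a single-precision one. A worked expansion in plain C, with the helper names invented:

#include <assert.h>
#include <stdint.h>

/* Invented helpers expanding the field definitions above. */
static unsigned vm_dp(uint32_t insn)
{
    return (((insn >> 5) & 1) << 4) | (insn & 0xf);   /* %vm_dp 5:1 0:4 */
}

static unsigned vm_sp(uint32_t insn)
{
    return ((insn & 0xf) << 1) | ((insn >> 5) & 1);   /* %vm_sp 0:4 5:1 */
}

int main(void)
{
    uint32_t insn = 0x0000002a;    /* bits [3:0] = 0xa, bit 5 = 1 */
    assert(vm_dp(insn) == 26);     /* D26: the one-bit field is the MSB */
    assert(vm_sp(insn) == 21);     /* S21: the one-bit field is the LSB */
    return 0;
}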

target/arm/vfp.decode (new file, 242 lines)

@@ -0,0 +1,242 @@
# AArch32 VFP instruction descriptions (conditional insns)
#
# Copyright (c) 2019 Linaro, Ltd
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see <http://www.gnu.org/licenses/>.
#
# This file is processed by scripts/decodetree.py
#
# Encodings for the conditional VFP instructions are here:
# generally anything matching A32
# cccc 11.. .... .... .... 101. .... ....
# and T32
# 1110 110. .... .... .... 101. .... ....
# 1110 1110 .... .... .... 101. .... ....
# (but those patterns might also cover some Neon instructions,
# which do not live in this file.)
# VFP registers have an odd encoding with a four-bit field
# and a one-bit field which are assembled in different orders
# depending on whether the register is double or single precision.
# Each individual instruction function must do the checks for
# "double register selected but CPU does not have double support"
# and "double register number has bit 4 set but CPU does not
# support D16-D31" (which should UNDEF).
%vm_dp 5:1 0:4
%vm_sp 0:4 5:1
%vn_dp 7:1 16:4
%vn_sp 16:4 7:1
%vd_dp 22:1 12:4
%vd_sp 12:4 22:1
%vmov_idx_b 21:1 5:2
%vmov_idx_h 21:1 6:1
# VMOV scalar to general-purpose register; note that this does
# include some Neon cases.
VMOV_to_gp ---- 1110 u:1 1. 1 .... rt:4 1011 ... 1 0000 \
vn=%vn_dp size=0 index=%vmov_idx_b
VMOV_to_gp ---- 1110 u:1 0. 1 .... rt:4 1011 ..1 1 0000 \
vn=%vn_dp size=1 index=%vmov_idx_h
VMOV_to_gp ---- 1110 0 0 index:1 1 .... rt:4 1011 .00 1 0000 \
vn=%vn_dp size=2 u=0
VMOV_from_gp ---- 1110 0 1. 0 .... rt:4 1011 ... 1 0000 \
vn=%vn_dp size=0 index=%vmov_idx_b
VMOV_from_gp ---- 1110 0 0. 0 .... rt:4 1011 ..1 1 0000 \
vn=%vn_dp size=1 index=%vmov_idx_h
VMOV_from_gp ---- 1110 0 0 index:1 0 .... rt:4 1011 .00 1 0000 \
vn=%vn_dp size=2
VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \
vn=%vn_dp
VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000
VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 \
vn=%vn_sp
VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... \
vm=%vm_sp
VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... \
vm=%vm_dp
# Note that the half-precision variants of VLDR and VSTR are
# not part of this decodetree at all because they have bits [9:8] == 0b01
VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 \
vd=%vd_sp
VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 \
vd=%vd_dp
# We split the load/store multiple up into two patterns to avoid
# overlap with other insns in the "Advanced SIMD load/store and 64-bit move"
# grouping:
# P=0 U=0 W=0 is 64-bit VMOV
# P=1 W=0 is VLDR/VSTR
# P=U W=1 is UNDEF
# leaving P=0 U=1 W=x and P=1 U=0 W=1 for load/store multiple.
# These include FSTM/FLDM.
VLDM_VSTM_sp ---- 1100 1 . w:1 l:1 rn:4 .... 1010 imm:8 \
vd=%vd_sp p=0 u=1
VLDM_VSTM_dp ---- 1100 1 . w:1 l:1 rn:4 .... 1011 imm:8 \
vd=%vd_dp p=0 u=1
VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \
vd=%vd_sp p=1 u=0 w=1
VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \
vd=%vd_dp p=1 u=0 w=1
# 3-register VFP data-processing; bits [23,21:20,6] identify the operation.
VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp
VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VFM_sp ---- 1110 1.01 .... .... 1010 . o2:1 . 0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp o1=1
VFM_dp ---- 1110 1.01 .... .... 1011 . o2:1 . 0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp o1=1
VFM_sp ---- 1110 1.10 .... .... 1010 . o2:1 . 0 .... \
vm=%vm_sp vn=%vn_sp vd=%vd_sp o1=2
VFM_dp ---- 1110 1.10 .... .... 1011 . o2:1 . 0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp o1=2
VMOV_imm_sp ---- 1110 1.11 imm4h:4 .... 1010 0000 imm4l:4 \
vd=%vd_sp
VMOV_imm_dp ---- 1110 1.11 imm4h:4 .... 1011 0000 imm4l:4 \
vd=%vd_dp
VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... \
vd=%vd_sp vm=%vm_sp
VMOV_reg_dp ---- 1110 1.11 0000 .... 1011 01.0 .... \
vd=%vd_dp vm=%vm_dp
VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... \
vd=%vd_sp vm=%vm_sp
VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... \
vd=%vd_dp vm=%vm_dp
VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... \
vd=%vd_sp vm=%vm_sp
VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... \
vd=%vd_dp vm=%vm_dp
VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... \
vd=%vd_sp vm=%vm_sp
VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... \
vd=%vd_dp vm=%vm_dp
VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \
vd=%vd_dp vm=%vm_dp
# VCVTT and VCVTB from f16: Vd format depends on size bit; Vm is always vm_sp
VCVT_f32_f16 ---- 1110 1.11 0010 .... 1010 t:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \
vd=%vd_dp vm=%vm_sp
# VCVTB and VCVTT to f16: Vd format is always vd_sp; Vm format depends on size bit
VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
vd=%vd_sp vm=%vm_dp
VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... \
vd=%vd_sp vm=%vm_sp
VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... \
vd=%vd_dp vm=%vm_dp
VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... \
vd=%vd_sp vm=%vm_sp
VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... \
vd=%vd_dp vm=%vm_dp
VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... \
vd=%vd_sp vm=%vm_sp
VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... \
vd=%vd_dp vm=%vm_dp
# VCVT between single and double: Vm precision depends on size; Vd is its reverse
VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... \
vd=%vd_dp vm=%vm_sp
VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... \
vd=%vd_sp vm=%vm_dp
# VCVT from integer to floating point: Vm always single; Vd depends on size
VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \
vd=%vd_dp vm=%vm_sp
# VJCVT is always dp to sp
VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... \
vd=%vd_sp vm=%vm_dp
# VCVT between floating-point and fixed-point. The immediate value
# is in the same format as a Vm single-precision register number.
# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field
# for the convenience of the trans_VCVT_fix functions.
%vcvt_fix_op 18:1 16:1 7:1
VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \
vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op
VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \
vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op
# VCVT float to integer (VCVT and VCVTR): Vd always single; Vm depends on size
VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... \
vd=%vd_sp vm=%vm_dp
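
The load/store-multiple comment above splits the encodings by the P, U and W bits; written out as a small self-contained classification (the enum and function names are invented here, not taken from the translator):

#include <assert.h>
#include <stdbool.h>

/* Invented names: a restatement of the P/U/W split described in the
 * "load/store multiple" comment in this file. */
typedef enum { INSN_VMOV64, INSN_VLDR_VSTR, INSN_VLDM_VSTM, INSN_UNDEF } Kind;

static Kind classify(bool p, bool u, bool w)
{
    if (!p && !u && !w) {
        return INSN_VMOV64;          /* P=0 U=0 W=0: 64-bit VMOV */
    }
    if (p && !w) {
        return INSN_VLDR_VSTR;       /* P=1 W=0: VLDR/VSTR */
    }
    if (p == u && w) {
        return INSN_UNDEF;           /* P=U W=1: UNDEF */
    }
    return INSN_VLDM_VSTM;           /* P=0 U=1 W=x and P=1 U=0 W=1 */
}

int main(void)
{
    assert(classify(0, 0, 0) == INSN_VMOV64);
    assert(classify(1, 0, 0) == INSN_VLDR_VSTR);
    assert(classify(1, 1, 1) == INSN_UNDEF);
    assert(classify(0, 1, 0) == INSN_VLDM_VSTM);
    assert(classify(0, 1, 1) == INSN_VLDM_VSTM);
    assert(classify(1, 0, 1) == INSN_VLDM_VSTM);
    return 0;
}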


@@ -15,7 +15,7 @@ run-fcvt: fcvt
$(call run-test,$<,$(QEMU) $<, "$< on $(TARGET_NAME)")
$(call diff-out,$<,$(AARCH64_SRC)/fcvt.ref)
AARCH64_TESTS += pauth-1
AARCH64_TESTS += pauth-1 pauth-2
run-pauth-%: QEMU += -cpu max
TESTS:=$(AARCH64_TESTS)


@@ -0,0 +1,61 @@
#include <stdint.h>
#include <assert.h>
asm(".arch armv8.4-a");
void do_test(uint64_t value)
{
uint64_t salt1, salt2;
uint64_t encode, decode;
/*
* With TBI enabled and a 48-bit VA, there are 7 bits of auth,
* and so a 1/128 chance that encode = pac(value, key, salt) leaves
* value unchanged.
* Iterate until we find a salt for which encode != value.
*/
for (salt1 = 1; ; salt1++) {
asm volatile("pacda %0, %2" : "=r"(encode) : "0"(value), "r"(salt1));
if (encode != value) {
break;
}
}
/* A valid salt must produce a valid authorization. */
asm volatile("autda %0, %2" : "=r"(decode) : "0"(encode), "r"(salt1));
assert(decode == value);
/*
* An invalid salt usually fails authorization, but again there
* is a chance of choosing another salt that works.
* Iterate until we find another salt which does fail.
*/
for (salt2 = salt1 + 1; ; salt2++) {
asm volatile("autda %0, %2" : "=r"(decode) : "0"(encode), "r"(salt2));
if (decode != value) {
break;
}
}
/* The VA bits, bit 55, and the TBI bits, should be unchanged. */
assert(((decode ^ value) & 0xff80ffffffffffffull) == 0);
/*
* Bits [54:53] are an error indicator based on the key used;
* the DA key above is keynumber 0, so error == 0b01. Otherwise
* bit 55 of the original is sign-extended into the rest of the auth.
*/
if ((value >> 55) & 1) {
assert(((decode >> 48) & 0xff) == 0b10111111);
} else {
assert(((decode >> 48) & 0xff) == 0b00100000);
}
}
int main()
{
do_test(0);
do_test(-1);
do_test(0xda004acedeadbeefull);
return 0;
}
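
The two magic constants in the asserts follow from the error-code placement the comment describes; a standalone recomputation using only the bit positions stated above:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* DA is key number 0, so the error code is (0 << 1) | (0 ^ 1) = 0b01. */
    unsigned keynumber = 0;
    unsigned error_code = (keynumber << 1) | (keynumber ^ 1);

    /* With TBI the code is deposited at bits [54:53]; bit 55 keeps the
     * original value and bits [52:48] hold its sign extension. Byte
     * [55:48] of the result is therefore: */
    unsigned top_bit55_set   = (1u << 7) | (error_code << 5) | 0x1f;
    unsigned top_bit55_clear = (0u << 7) | (error_code << 5) | 0x00;

    assert(top_bit55_set   == 0xbf);   /* 0b10111111, as the test expects */
    assert(top_bit55_clear == 0x20);   /* 0b00100000 */
    return 0;
}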