diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e0ea599769..fc0b179df4 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2372,6 +2372,17 @@ float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
             }
         }
         /* Zero plus something non-zero : just return the something */
+        if (flags & float_muladd_halve_result) {
+            if (cExp == 0) {
+                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
+            }
+            /* Subtract one to halve, and one again because roundAndPackFloat32
+             * wants one less than the true exponent.
+             */
+            cExp -= 2;
+            cSig = (cSig | 0x00800000) << 7;
+            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
+        }
         return packFloat32(cSign ^ signflip, cExp, cSig);
     }
 
@@ -2408,6 +2419,9 @@ float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
             /* Throw out the special case of c being an exact zero now */
             shift64RightJamming(pSig64, 32, &pSig64);
             pSig = pSig64;
+            if (flags & float_muladd_halve_result) {
+                pExp--;
+            }
             return roundAndPackFloat32(zSign, pExp - 1,
                                        pSig STATUS_VAR);
         }
@@ -2472,6 +2486,10 @@ float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
         zSig64 <<= shiftcount;
         zExp -= shiftcount;
     }
+    if (flags & float_muladd_halve_result) {
+        zExp--;
+    }
+
     shift64RightJamming(zSig64, 32, &zSig64);
     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
 }
@@ -4088,6 +4106,17 @@ float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
             }
         }
         /* Zero plus something non-zero : just return the something */
+        if (flags & float_muladd_halve_result) {
+            if (cExp == 0) {
+                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
+            }
+            /* Subtract one to halve, and one again because roundAndPackFloat64
+             * wants one less than the true exponent.
+             */
+            cExp -= 2;
+            cSig = (cSig | 0x0010000000000000ULL) << 10;
+            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
+        }
         return packFloat64(cSign ^ signflip, cExp, cSig);
     }
 
@@ -4123,6 +4152,9 @@ float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
         if (!cSig) {
             /* Throw out the special case of c being an exact zero now */
             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
+            if (flags & float_muladd_halve_result) {
+                pExp--;
+            }
             return roundAndPackFloat64(zSign, pExp - 1,
                                        pSig1 STATUS_VAR);
         }
@@ -4159,6 +4191,9 @@ float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
             zExp--;
         }
         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
+        if (flags & float_muladd_halve_result) {
+            zExp--;
+        }
         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
     } else {
         /* Subtraction */
@@ -4209,6 +4244,9 @@ float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
                 zExp -= (shiftcount + 64);
             }
         }
+        if (flags & float_muladd_halve_result) {
+            zExp--;
+        }
         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
     }
 }
diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index 66bc0dccc9..45a99c819d 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -224,27 +224,24 @@ static const VMStateDescription vmstate_pxa2xx_cm = {
     }
 };
 
-static int pxa2xx_clkcfg_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                              uint64_t *value)
+static uint64_t pxa2xx_clkcfg_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
-    *value = s->clkcfg;
-    return 0;
+    return s->clkcfg;
 }
 
-static int pxa2xx_clkcfg_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void pxa2xx_clkcfg_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                uint64_t value)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
     s->clkcfg = value & 0xf;
     if (value & 2) {
         printf("%s: CPU frequency change attempt\n", __func__);
     }
-    return 0;
 }
 
-static int pxa2xx_pwrmode_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                uint64_t value)
+static void pxa2xx_pwrmode_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
     static const char *pwrmode[8] = {
@@ -310,36 +307,29 @@ static int pxa2xx_pwrmode_write(CPUARMState *env, const ARMCPRegInfo *ri,
         printf("%s: machine entered %s mode\n", __func__,
                pwrmode[value & 7]);
     }
-
-    return 0;
 }
 
-static int pxa2xx_cppmnc_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                              uint64_t *value)
+static uint64_t pxa2xx_cppmnc_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
-    *value = s->pmnc;
-    return 0;
+    return s->pmnc;
 }
 
-static int pxa2xx_cppmnc_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void pxa2xx_cppmnc_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                uint64_t value)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
     s->pmnc = value;
-    return 0;
 }
 
-static int pxa2xx_cpccnt_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                              uint64_t *value)
+static uint64_t pxa2xx_cpccnt_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     PXA2xxState *s = (PXA2xxState *)ri->opaque;
     if (s->pmnc & 1) {
-        *value = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+        return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
     } else {
-        *value = 0;
+        return 0;
     }
-    return 0;
 }
 
 static const ARMCPRegInfo pxa_cp_reginfo[] = {
diff --git a/hw/arm/pxa2xx_pic.c b/hw/arm/pxa2xx_pic.c
index 46d337cf84..345fa4a491 100644
--- a/hw/arm/pxa2xx_pic.c
+++ b/hw/arm/pxa2xx_pic.c
@@ -217,20 +217,17 @@ static const int pxa2xx_cp_reg_map[0x10] = {
     [0xa] = ICPR2,
 };
 
-static int pxa2xx_pic_cp_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                              uint64_t *value)
+static uint64_t pxa2xx_pic_cp_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     int offset = pxa2xx_cp_reg_map[ri->crn];
-    *value = pxa2xx_pic_mem_read(ri->opaque, offset, 4);
-    return 0;
+    return pxa2xx_pic_mem_read(ri->opaque, offset, 4);
 }
 
-static int pxa2xx_pic_cp_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void pxa2xx_pic_cp_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                uint64_t value)
 {
     int offset = pxa2xx_cp_reg_map[ri->crn];
     pxa2xx_pic_mem_write(ri->opaque, offset, value, 4);
-    return 0;
 }
 
 #define REGINFO_FOR_PIC_CP(NAME, CRN) \
diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 93eaa6b2fa..955b8d4945 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -189,7 +189,7 @@ uint32_t gic_acknowledge_irq(GICState *s, int cpu)
     }
     s->last_active[irq][cpu] = s->running_irq[cpu];
 
-    if (s->revision == REV_11MPCORE) {
+    if (s->revision == REV_11MPCORE || s->revision == REV_NVIC) {
         /* Clear pending flags for both level and edge triggered interrupts.
          * Level triggered IRQs will be reasserted once they become inactive.
          */
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 806ae13780..4b4df88527 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -249,11 +249,14 @@ void float_raise( int8 flags STATUS_PARAM);
 | Using these differs from negating an input or output before calling
 | the muladd function in that this means that a NaN doesn't have its
 | sign bit inverted before it is propagated.
+| We also support halving the result before rounding, as a special
+| case to support the ARM fused-sqrt-step instruction FRSQRTS.
 *----------------------------------------------------------------------------*/
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
+    float_muladd_halve_result = 8,
 };
 
 /*----------------------------------------------------------------------------
diff --git a/linux-user/main.c b/linux-user/main.c
index cabc9e1a0e..919297736c 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -953,7 +953,8 @@ static int do_strex_a64(CPUARMState *env)
             goto finish;
         }
     }
-    val = env->xregs[rt];
+    /* handle the zero register */
+    val = rt == 31 ? 0 : env->xregs[rt];
     switch (size) {
     case 0:
         segv = put_user_u8(val, addr);
@@ -972,7 +973,8 @@ static int do_strex_a64(CPUARMState *env)
         goto error;
     }
     if (is_pair) {
-        val = env->xregs[rt2];
+        /* handle the zero register */
+        val = rt2 == 31 ? 0 : env->xregs[rt2];
         if (size == 2) {
             segv = put_user_u32(val, addr + 4);
         } else {
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 45ad7f0260..6e7ce8905e 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -128,7 +128,7 @@ static void arm_cpu_reset(CPUState *s)
         }
     }
 
-    if (env->cp15.c1_sys & (1 << 13)) {
+    if (env->cp15.c1_sys & SCTLR_V) {
             env->regs[15] = 0xFFFF0000;
     }
 
@@ -681,14 +681,12 @@ static void cortex_a9_initfn(Object *obj)
 }
 
 #ifndef CONFIG_USER_ONLY
-static int a15_l2ctlr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                           uint64_t *value)
+static uint64_t a15_l2ctlr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     /* Linux wants the number of processors from here.
      * Might as well set the interrupt-controller bit too.
      */
-    *value = ((smp_cpus - 1) << 24) | (1 << 23);
-    return 0;
+    return ((smp_cpus - 1) << 24) | (1 << 23);
 }
 #endif
 
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 383c58221e..3c8a2dba2f 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -217,11 +217,6 @@ typedef struct CPUARMState {
         uint32_t c15_power_control; /* power control */
     } cp15;
 
-    /* System registers (AArch64) */
-    struct {
-        uint64_t tpidr_el0;
-    } sr;
-
     struct {
         uint32_t other_sp;
         uint32_t vecbase;
@@ -337,6 +332,58 @@ int cpu_arm_handle_mmu_fault (CPUARMState *env, target_ulong address, int rw,
                               int mmu_idx);
 #define cpu_handle_mmu_fault cpu_arm_handle_mmu_fault
 
+/* SCTLR bit meanings. Several bits have been reused in newer
+ * versions of the architecture; in that case we define constants
+ * for both old and new bit meanings. Code which tests against those
+ * bits should probably check or otherwise arrange that the CPU
+ * is the architectural version it expects.
+ */
+#define SCTLR_M       (1U << 0)
+#define SCTLR_A       (1U << 1)
+#define SCTLR_C       (1U << 2)
+#define SCTLR_W       (1U << 3) /* up to v6; RAO in v7 */
+#define SCTLR_SA      (1U << 3)
+#define SCTLR_P       (1U << 4) /* up to v5; RAO in v6 and v7 */
+#define SCTLR_SA0     (1U << 4) /* v8 onward, AArch64 only */
+#define SCTLR_D       (1U << 5) /* up to v5; RAO in v6 */
+#define SCTLR_CP15BEN (1U << 5) /* v7 onward */
+#define SCTLR_L       (1U << 6) /* up to v5; RAO in v6 and v7; RAZ in v8 */
+#define SCTLR_B       (1U << 7) /* up to v6; RAZ in v7 */
+#define SCTLR_ITD     (1U << 7) /* v8 onward */
+#define SCTLR_S       (1U << 8) /* up to v6; RAZ in v7 */
+#define SCTLR_SED     (1U << 8) /* v8 onward */
+#define SCTLR_R       (1U << 9) /* up to v6; RAZ in v7 */
+#define SCTLR_UMA     (1U << 9) /* v8 onward, AArch64 only */
+#define SCTLR_F       (1U << 10) /* up to v6 */
+#define SCTLR_SW      (1U << 10) /* v7 onward */
+#define SCTLR_Z       (1U << 11)
+#define SCTLR_I       (1U << 12)
+#define SCTLR_V       (1U << 13)
+#define SCTLR_RR      (1U << 14) /* up to v7 */
+#define SCTLR_DZE     (1U << 14) /* v8 onward, AArch64 only */
+#define SCTLR_L4      (1U << 15) /* up to v6; RAZ in v7 */
+#define SCTLR_UCT     (1U << 15) /* v8 onward, AArch64 only */
+#define SCTLR_DT      (1U << 16) /* up to ??, RAO in v6 and v7 */
+#define SCTLR_nTWI    (1U << 16) /* v8 onward */
+#define SCTLR_HA      (1U << 17)
+#define SCTLR_IT      (1U << 18) /* up to ??, RAO in v6 and v7 */
+#define SCTLR_nTWE    (1U << 18) /* v8 onward */
+#define SCTLR_WXN     (1U << 19)
+#define SCTLR_ST      (1U << 20) /* up to ??, RAZ in v6 */
+#define SCTLR_UWXN    (1U << 20) /* v7 onward */
+#define SCTLR_FI      (1U << 21)
+#define SCTLR_U       (1U << 22)
+#define SCTLR_XP      (1U << 23) /* up to v6; v7 onward RAO */
+#define SCTLR_VE      (1U << 24) /* up to v7 */
+#define SCTLR_E0E     (1U << 24) /* v8 onward, AArch64 only */
+#define SCTLR_EE      (1U << 25)
+#define SCTLR_L2      (1U << 26) /* up to v6, RAZ in v7 */
+#define SCTLR_UCI     (1U << 26) /* v8 onward, AArch64 only */
+#define SCTLR_NMFI    (1U << 27)
+#define SCTLR_TRE     (1U << 28)
+#define SCTLR_AFE     (1U << 29)
+#define SCTLR_TE      (1U << 30)
+
 #define CPSR_M (0x1fU)
 #define CPSR_T (1U << 5)
 #define CPSR_F (1U << 6)
@@ -764,14 +811,30 @@ static inline int arm_current_pl(CPUARMState *env)
 
 typedef struct ARMCPRegInfo ARMCPRegInfo;
 
-/* Access functions for coprocessor registers. These should return
- * 0 on success, or one of the EXCP_* constants if access should cause
- * an exception (in which case *value is not written).
+typedef enum CPAccessResult {
+    /* Access is permitted */
+    CP_ACCESS_OK = 0,
+    /* Access fails due to a configurable trap or enable which would
+     * result in a categorized exception syndrome giving information about
+     * the failing instruction (ie syndrome category 0x3, 0x4, 0x5, 0x6,
+     * 0xc or 0x18).
+     */
+    CP_ACCESS_TRAP = 1,
+    /* Access fails and results in an exception syndrome 0x0 ("uncategorized").
+     * Note that this is not a catch-all case -- the set of cases which may
+     * result in this failure is specifically defined by the architecture.
+     */
+    CP_ACCESS_TRAP_UNCATEGORIZED = 2,
+} CPAccessResult;
+
+/* Access functions for coprocessor registers. These cannot fail and
+ * may not raise exceptions.
  */
-typedef int CPReadFn(CPUARMState *env, const ARMCPRegInfo *opaque,
-                     uint64_t *value);
-typedef int CPWriteFn(CPUARMState *env, const ARMCPRegInfo *opaque,
-                      uint64_t value);
+typedef uint64_t CPReadFn(CPUARMState *env, const ARMCPRegInfo *opaque);
+typedef void CPWriteFn(CPUARMState *env, const ARMCPRegInfo *opaque,
+                       uint64_t value);
+/* Access permission check functions for coprocessor registers. */
+typedef CPAccessResult CPAccessFn(CPUARMState *env, const ARMCPRegInfo *opaque);
 /* Hook function for register reset */
 typedef void CPResetFn(CPUARMState *env, const ARMCPRegInfo *opaque);
 
@@ -825,6 +888,12 @@ struct ARMCPRegInfo {
      *  2. both readfn and writefn are specified
      */
     ptrdiff_t fieldoffset; /* offsetof(CPUARMState, field) */
+    /* Function for making any access checks for this register in addition to
+     * those specified by the 'access' permissions bits. If NULL, no extra
+     * checks required. The access check is performed at runtime, not at
+     * translate time.
+     */
+    CPAccessFn *accessfn;
     /* Function for handling reads of this register. If NULL, then reads
      * will be done by loading from the offset into CPUARMState specified
      * by fieldoffset.
@@ -838,14 +907,14 @@ struct ARMCPRegInfo {
     /* Function for doing a "raw" read; used when we need to copy
      * coprocessor state to the kernel for KVM or out for
      * migration. This only needs to be provided if there is also a
-     * readfn and it makes an access permission check.
+     * readfn and it has side effects (for instance clear-on-read bits).
      */
     CPReadFn *raw_readfn;
     /* Function for doing a "raw" write; used when we need to copy KVM
      * kernel coprocessor state into userspace, or for inbound
      * migration. This only needs to be provided if there is also a
-     * writefn and it makes an access permission check or masks out
-     * "unwritable" bits or has write-one-to-clear or similar behaviour.
+     * writefn and it masks out "unwritable" bits or has write-one-to-clear
+     * or similar behaviour.
      */
     CPWriteFn *raw_writefn;
     /* Function for resetting the register. If NULL, then reset will be done
@@ -880,10 +949,10 @@ static inline void define_one_arm_cp_reg(ARMCPU *cpu, const ARMCPRegInfo *regs)
 const ARMCPRegInfo *get_arm_cp_reginfo(GHashTable *cpregs, uint32_t encoded_cp);
 
 /* CPWriteFn that can be used to implement writes-ignored behaviour */
-int arm_cp_write_ignore(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value);
+void arm_cp_write_ignore(CPUARMState *env, const ARMCPRegInfo *ri,
+                         uint64_t value);
 /* CPReadFn that can be used for read-as-zero behaviour */
-int arm_cp_read_zero(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t *value);
+uint64_t arm_cp_read_zero(CPUARMState *env, const ARMCPRegInfo *ri);
 
 /* CPResetFn that does nothing, for use if no reset is required even
  * if fieldoffset is non zero.
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 6ca958afb1..c2ce33ee88 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -123,6 +123,32 @@ uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
     return float_rel_to_flags(float64_compare(x, y, fp_status));
 }
 
+float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    if ((float32_is_zero(a) && float32_is_infinity(b)) ||
+        (float32_is_infinity(a) && float32_is_zero(b))) {
+        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
+        return make_float32((1U << 30) |
+                            ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
+    }
+    return float32_mul(a, b, fpst);
+}
+
+float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    if ((float64_is_zero(a) && float64_is_infinity(b)) ||
+        (float64_is_infinity(a) && float64_is_zero(b))) {
+        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
+        return make_float64((1ULL << 62) |
+                            ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
+    }
+    return float64_mul(a, b, fpst);
+}
+
 uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices,
                           uint32_t rn, uint32_t numregs)
 {
@@ -153,3 +179,82 @@ uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices,
     }
     return result;
 }
+
+/* 64bit/double versions of the neon float compare functions */
+uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return -float64_eq_quiet(a, b, fpst);
+}
+
+uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return -float64_le(b, a, fpst);
+}
+
+uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return -float64_lt(b, a, fpst);
+}
+
+/* Reciprocal step and sqrt step. Note that unlike the A32/T32
+ * versions, these do a fully fused multiply-add or
+ * multiply-add-and-halve.
+ */
+#define float32_two make_float32(0x40000000)
+#define float32_three make_float32(0x40400000)
+#define float32_one_point_five make_float32(0x3fc00000)
+
+#define float64_two make_float64(0x4000000000000000ULL)
+#define float64_three make_float64(0x4008000000000000ULL)
+#define float64_one_point_five make_float64(0x3FF8000000000000ULL)
+
+float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    a = float32_chs(a);
+    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
+        (float32_is_infinity(b) && float32_is_zero(a))) {
+        return float32_two;
+    }
+    return float32_muladd(a, b, float32_two, 0, fpst);
+}
+
+float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    a = float64_chs(a);
+    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
+        (float64_is_infinity(b) && float64_is_zero(a))) {
+        return float64_two;
+    }
+    return float64_muladd(a, b, float64_two, 0, fpst);
+}
+
+float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    a = float32_chs(a);
+    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
+        (float32_is_infinity(b) && float32_is_zero(a))) {
+        return float32_one_point_five;
+    }
+    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
+}
+
+float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+
+    a = float64_chs(a);
+    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
+        (float64_is_infinity(b) && float64_is_zero(a))) {
+        return float64_one_point_five;
+    }
+    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 99832ee55e..ab9933cab0 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -27,3 +27,12 @@ DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
 DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
 DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
+DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
+DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
+DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
+DEF_HELPER_FLAGS_3(neon_cge_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
+DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
+DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
+DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
+DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
+DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 5ae08c9ad1..1b111b6e85 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -107,65 +107,56 @@ static int aarch64_fpu_gdb_set_reg(CPUARMState *env, uint8_t *buf, int reg)
     }
 }
 
-static int raw_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                    uint64_t *value)
+static uint64_t raw_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     if (ri->type & ARM_CP_64BIT) {
-        *value = CPREG_FIELD64(env, ri);
+        return CPREG_FIELD64(env, ri);
     } else {
-        *value = CPREG_FIELD32(env, ri);
+        return CPREG_FIELD32(env, ri);
     }
-    return 0;
 }
 
-static int raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                     uint64_t value)
+static void raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                      uint64_t value)
 {
     if (ri->type & ARM_CP_64BIT) {
         CPREG_FIELD64(env, ri) = value;
     } else {
         CPREG_FIELD32(env, ri) = value;
     }
-    return 0;
 }
 
-static bool read_raw_cp_reg(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t *v)
+static uint64_t read_raw_cp_reg(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    /* Raw read of a coprocessor register (as needed for migration, etc)
-     * return true on success, false if the read is impossible for some reason.
-     */
+    /* Raw read of a coprocessor register (as needed for migration, etc). */
     if (ri->type & ARM_CP_CONST) {
-        *v = ri->resetvalue;
+        return ri->resetvalue;
     } else if (ri->raw_readfn) {
-        return (ri->raw_readfn(env, ri, v) == 0);
+        return ri->raw_readfn(env, ri);
     } else if (ri->readfn) {
-        return (ri->readfn(env, ri, v) == 0);
+        return ri->readfn(env, ri);
     } else {
-        raw_read(env, ri, v);
+        return raw_read(env, ri);
     }
-    return true;
 }
 
-static bool write_raw_cp_reg(CPUARMState *env, const ARMCPRegInfo *ri,
-                             int64_t v)
+static void write_raw_cp_reg(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t v)
 {
     /* Raw write of a coprocessor register (as needed for migration, etc).
-     * Return true on success, false if the write is impossible for some reason.
      * Note that constant registers are treated as write-ignored; the
      * caller should check for success by whether a readback gives the
      * value written.
      */
     if (ri->type & ARM_CP_CONST) {
-        return true;
+        return;
     } else if (ri->raw_writefn) {
-        return (ri->raw_writefn(env, ri, v) == 0);
+        ri->raw_writefn(env, ri, v);
     } else if (ri->writefn) {
-        return (ri->writefn(env, ri, v) == 0);
+        ri->writefn(env, ri, v);
     } else {
         raw_write(env, ri, v);
     }
-    return true;
 }
 
 bool write_cpustate_to_list(ARMCPU *cpu)
@@ -177,7 +168,7 @@ bool write_cpustate_to_list(ARMCPU *cpu)
     for (i = 0; i < cpu->cpreg_array_len; i++) {
         uint32_t regidx = kvm_to_cpreg_id(cpu->cpreg_indexes[i]);
         const ARMCPRegInfo *ri;
-        uint64_t v;
+
         ri = get_arm_cp_reginfo(cpu->cp_regs, regidx);
         if (!ri) {
             ok = false;
@@ -186,11 +177,7 @@ bool write_cpustate_to_list(ARMCPU *cpu)
         if (ri->type & ARM_CP_NO_MIGRATE) {
             continue;
         }
-        if (!read_raw_cp_reg(&cpu->env, ri, &v)) {
-            ok = false;
-            continue;
-        }
-        cpu->cpreg_values[i] = v;
+        cpu->cpreg_values[i] = read_raw_cp_reg(&cpu->env, ri);
     }
     return ok;
 }
@@ -203,7 +190,6 @@ bool write_list_to_cpustate(ARMCPU *cpu)
     for (i = 0; i < cpu->cpreg_array_len; i++) {
         uint32_t regidx = kvm_to_cpreg_id(cpu->cpreg_indexes[i]);
         uint64_t v = cpu->cpreg_values[i];
-        uint64_t readback;
         const ARMCPRegInfo *ri;
 
         ri = get_arm_cp_reginfo(cpu->cp_regs, regidx);
@@ -218,9 +204,8 @@ bool write_list_to_cpustate(ARMCPU *cpu)
          * (to catch read-only registers and partially read-only
          * registers where the incoming migration value doesn't match)
          */
-        if (!write_raw_cp_reg(&cpu->env, ri, v) ||
-            !read_raw_cp_reg(&cpu->env, ri, &readback) ||
-            readback != v) {
+        write_raw_cp_reg(&cpu->env, ri, v);
+        if (read_raw_cp_reg(&cpu->env, ri) != v) {
             ok = false;
         }
     }
@@ -309,14 +294,13 @@ void init_cpreg_list(ARMCPU *cpu)
     g_list_free(keys);
 }
 
-static int dacr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void dacr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
     env->cp15.c3 = value;
     tlb_flush(env, 1); /* Flush TLB as domain not tracked in TLB */
-    return 0;
 }
 
-static int fcse_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void fcse_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
     if (env->cp15.c13_fcse != value) {
         /* Unlike real hardware the qemu TLB uses virtual addresses,
@@ -325,10 +309,10 @@ static int fcse_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
         tlb_flush(env, 1);
         env->cp15.c13_fcse = value;
     }
-    return 0;
 }
-static int contextidr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+
+static void contextidr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
     if (env->cp15.c13_context != value && !arm_feature(env, ARM_FEATURE_MPU)) {
         /* For VMSA (when not using the LPAE long descriptor page table
@@ -338,39 +322,34 @@ static int contextidr_write(CPUARMState *env, const ARMCPRegInfo *ri,
         tlb_flush(env, 1);
     }
     env->cp15.c13_context = value;
-    return 0;
 }
 
-static int tlbiall_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void tlbiall_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     /* Invalidate all (TLBIALL) */
     tlb_flush(env, 1);
-    return 0;
 }
 
-static int tlbimva_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void tlbimva_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     /* Invalidate single TLB entry by MVA and ASID (TLBIMVA) */
     tlb_flush_page(env, value & TARGET_PAGE_MASK);
-    return 0;
 }
 
-static int tlbiasid_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t value)
+static void tlbiasid_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                           uint64_t value)
 {
     /* Invalidate by ASID (TLBIASID) */
     tlb_flush(env, value == 0);
-    return 0;
 }
 
-static int tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t value)
+static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                           uint64_t value)
 {
     /* Invalidate single entry by MVA, all ASIDs (TLBIMVAA) */
     tlb_flush_page(env, value & TARGET_PAGE_MASK);
-    return 0;
 }
 
 static const ARMCPRegInfo cp_reginfo[] = {
@@ -450,14 +429,14 @@ static const ARMCPRegInfo not_v7_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int cpacr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void cpacr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
 {
     if (env->cp15.c1_coproc != value) {
         env->cp15.c1_coproc = value;
         /* ??? Is this safe when called from within a TB?  */
         tb_flush(env);
     }
-    return 0;
 }
 
 static const ARMCPRegInfo v6_cp_reginfo[] = {
@@ -485,118 +464,88 @@ static const ARMCPRegInfo v6_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-
-static int pmreg_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                      uint64_t *value)
+static CPAccessResult pmreg_access(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    /* Generic performance monitor register read function for where
-     * user access may be allowed by PMUSERENR.
+    /* Performance monitor registers user accessibility is controlled
+     * by PMUSERENR.
      */
     if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
+        return CP_ACCESS_TRAP;
     }
-    *value = CPREG_FIELD32(env, ri);
-    return 0;
+    return CP_ACCESS_OK;
 }
 
-static int pmcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                      uint64_t value)
+static void pmcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                       uint64_t value)
 {
-    if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
-    }
     /* only the DP, X, D and E bits are writable */
     env->cp15.c9_pmcr &= ~0x39;
     env->cp15.c9_pmcr |= (value & 0x39);
-    return 0;
 }
 
-static int pmcntenset_write(CPUARMState *env, const ARMCPRegInfo *ri,
+static void pmcntenset_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
-    if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
-    }
     value &= (1 << 31);
     env->cp15.c9_pmcnten |= value;
-    return 0;
 }
 
-static int pmcntenclr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+static void pmcntenclr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
-    if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
-    }
     value &= (1 << 31);
     env->cp15.c9_pmcnten &= ~value;
-    return 0;
 }
 
-static int pmovsr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value)
+static void pmovsr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                         uint64_t value)
 {
-    if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
-    }
     env->cp15.c9_pmovsr &= ~value;
-    return 0;
 }
 
-static int pmxevtyper_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+static void pmxevtyper_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
-    if (arm_current_pl(env) == 0 && !env->cp15.c9_pmuserenr) {
-        return EXCP_UDEF;
-    }
     env->cp15.c9_pmxevtyper = value & 0xff;
-    return 0;
 }
 
-static int pmuserenr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+static void pmuserenr_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
 {
     env->cp15.c9_pmuserenr = value & 1;
-    return 0;
 }
 
-static int pmintenset_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+static void pmintenset_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
     /* We have no event counters so only the C bit can be changed */
     value &= (1 << 31);
     env->cp15.c9_pminten |= value;
-    return 0;
 }
 
-static int pmintenclr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+static void pmintenclr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
     value &= (1 << 31);
     env->cp15.c9_pminten &= ~value;
-    return 0;
 }
 
-static int vbar_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                      uint64_t value)
+static void vbar_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                       uint64_t value)
 {
     env->cp15.c12_vbar = value & ~0x1Ful;
-    return 0;
 }
 
-static int ccsidr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                       uint64_t *value)
+static uint64_t ccsidr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
-    *value = cpu->ccsidr[env->cp15.c0_cssel];
-    return 0;
+    return cpu->ccsidr[env->cp15.c0_cssel];
 }
 
-static int csselr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value)
+static void csselr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                         uint64_t value)
 {
     env->cp15.c0_cssel = value & 0xf;
-    return 0;
 }
 
 static const ARMCPRegInfo v7_cp_reginfo[] = {
@@ -624,37 +573,41 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
     { .name = "PMCNTENSET", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 1,
       .access = PL0_RW, .resetvalue = 0,
       .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcnten),
-      .readfn = pmreg_read, .writefn = pmcntenset_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write },
+      .writefn = pmcntenset_write,
+      .accessfn = pmreg_access,
+      .raw_writefn = raw_write },
     { .name = "PMCNTENCLR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 2,
       .access = PL0_RW, .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcnten),
-      .readfn = pmreg_read, .writefn = pmcntenclr_write,
+      .accessfn = pmreg_access,
+      .writefn = pmcntenclr_write,
       .type = ARM_CP_NO_MIGRATE },
     { .name = "PMOVSR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 3,
       .access = PL0_RW, .fieldoffset = offsetof(CPUARMState, cp15.c9_pmovsr),
-      .readfn = pmreg_read, .writefn = pmovsr_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write },
-    /* Unimplemented so WI. Strictly speaking write accesses in PL0 should
-     * respect PMUSERENR.
-     */
+      .accessfn = pmreg_access,
+      .writefn = pmovsr_write,
+      .raw_writefn = raw_write },
+    /* Unimplemented so WI. */
     { .name = "PMSWINC", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 4,
-      .access = PL0_W, .type = ARM_CP_NOP },
+      .access = PL0_W, .accessfn = pmreg_access, .type = ARM_CP_NOP },
     /* Since we don't implement any events, writing to PMSELR is UNPREDICTABLE.
-     * We choose to RAZ/WI. XXX should respect PMUSERENR.
+     * We choose to RAZ/WI.
      */
     { .name = "PMSELR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 5,
-      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
-    /* Unimplemented, RAZ/WI. XXX PMUSERENR */
+      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0,
+      .accessfn = pmreg_access },
+    /* Unimplemented, RAZ/WI. */
     { .name = "PMCCNTR", .cp = 15, .crn = 9, .crm = 13, .opc1 = 0, .opc2 = 0,
-      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
+      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0,
+      .accessfn = pmreg_access },
     { .name = "PMXEVTYPER", .cp = 15, .crn = 9, .crm = 13, .opc1 = 0, .opc2 = 1,
       .access = PL0_RW,
       .fieldoffset = offsetof(CPUARMState, cp15.c9_pmxevtyper),
-      .readfn = pmreg_read, .writefn = pmxevtyper_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write },
-    /* Unimplemented, RAZ/WI. XXX PMUSERENR */
+      .accessfn = pmreg_access, .writefn = pmxevtyper_write,
+      .raw_writefn = raw_write },
+    /* Unimplemented, RAZ/WI. */
     { .name = "PMXEVCNTR", .cp = 15, .crn = 9, .crm = 13, .opc1 = 0, .opc2 = 2,
-      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0 },
+      .access = PL0_RW, .type = ARM_CP_CONST, .resetvalue = 0,
+      .accessfn = pmreg_access },
     { .name = "PMUSERENR", .cp = 15, .crn = 9, .crm = 14, .opc1 = 0, .opc2 = 0,
       .access = PL0_R | PL1_RW,
       .fieldoffset = offsetof(CPUARMState, cp15.c9_pmuserenr),
@@ -689,34 +642,19 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int teecr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void teecr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
 {
     value &= 1;
     env->teecr = value;
-    return 0;
 }
 
-static int teehbr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                       uint64_t *value)
-{
-    /* This is a helper function because the user access rights
-     * depend on the value of the TEECR.
-     */
-    if (arm_current_pl(env) == 0 && (env->teecr & 1)) {
-        return EXCP_UDEF;
-    }
-    *value = env->teehbr;
-    return 0;
-}
-
-static int teehbr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value)
+static CPAccessResult teehbr_access(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     if (arm_current_pl(env) == 0 && (env->teecr & 1)) {
-        return EXCP_UDEF;
+        return CP_ACCESS_TRAP;
     }
-    env->teehbr = value;
-    return 0;
+    return CP_ACCESS_OK;
 }
 
 static const ARMCPRegInfo t2ee_cp_reginfo[] = {
@@ -726,8 +664,7 @@ static const ARMCPRegInfo t2ee_cp_reginfo[] = {
       .writefn = teecr_write },
     { .name = "TEEHBR", .cp = 14, .crn = 1, .crm = 0, .opc1 = 6, .opc2 = 0,
       .access = PL0_RW, .fieldoffset = offsetof(CPUARMState, teehbr),
-      .resetvalue = 0, .raw_readfn = raw_read, .raw_writefn = raw_write,
-      .readfn = teehbr_read, .writefn = teehbr_write },
+      .accessfn = teehbr_access, .resetvalue = 0 },
     REGINFO_SENTINEL
 };
 
@@ -757,6 +694,59 @@ static const ARMCPRegInfo v6k_cp_reginfo[] = {
 
 #ifndef CONFIG_USER_ONLY
 
+static CPAccessResult gt_cntfrq_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    /* CNTFRQ: not visible from PL0 if both PL0PCTEN and PL0VCTEN are zero */
+    if (arm_current_pl(env) == 0 && !extract32(env->cp15.c14_cntkctl, 0, 2)) {
+        return CP_ACCESS_TRAP;
+    }
+    return CP_ACCESS_OK;
+}
+
+static CPAccessResult gt_counter_access(CPUARMState *env, int timeridx)
+{
+    /* CNT[PV]CT: not visible from PL0 if EL0[PV]CTEN is zero */
+    if (arm_current_pl(env) == 0 &&
+        !extract32(env->cp15.c14_cntkctl, timeridx, 1)) {
+        return CP_ACCESS_TRAP;
+    }
+    return CP_ACCESS_OK;
+}
+
+static CPAccessResult gt_timer_access(CPUARMState *env, int timeridx)
+{
+    /* CNT[PV]_CVAL, CNT[PV]_CTL, CNT[PV]_TVAL: not visible from PL0 if
+     * EL0[PV]TEN is zero.
+     */
+    if (arm_current_pl(env) == 0 &&
+        !extract32(env->cp15.c14_cntkctl, 9 - timeridx, 1)) {
+        return CP_ACCESS_TRAP;
+    }
+    return CP_ACCESS_OK;
+}
+
+static CPAccessResult gt_pct_access(CPUARMState *env,
+                                         const ARMCPRegInfo *ri)
+{
+    return gt_counter_access(env, GTIMER_PHYS);
+}
+
+static CPAccessResult gt_vct_access(CPUARMState *env,
+                                         const ARMCPRegInfo *ri)
+{
+    return gt_counter_access(env, GTIMER_VIRT);
+}
+
+static CPAccessResult gt_ptimer_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    return gt_timer_access(env, GTIMER_PHYS);
+}
+
+static CPAccessResult gt_vtimer_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    return gt_timer_access(env, GTIMER_VIRT);
+}
+
 static uint64_t gt_get_countervalue(CPUARMState *env)
 {
     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / GTIMER_SCALE;
@@ -802,17 +792,6 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx)
     }
 }
 
-static int gt_cntfrq_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t *value)
-{
-    /* Not visible from PL0 if both PL0PCTEN and PL0VCTEN are zero */
-    if (arm_current_pl(env) == 0 && !extract32(env->cp15.c14_cntkctl, 0, 2)) {
-        return EXCP_UDEF;
-    }
-    *value = env->cp15.c14_cntfrq;
-    return 0;
-}
-
 static void gt_cnt_reset(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
@@ -821,81 +800,40 @@ static void gt_cnt_reset(CPUARMState *env, const ARMCPRegInfo *ri)
     timer_del(cpu->gt_timer[timeridx]);
 }
 
-static int gt_cnt_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                       uint64_t *value)
+static uint64_t gt_cnt_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    int timeridx = ri->opc1 & 1;
-
-    if (arm_current_pl(env) == 0 &&
-        !extract32(env->cp15.c14_cntkctl, timeridx, 1)) {
-        return EXCP_UDEF;
-    }
-    *value = gt_get_countervalue(env);
-    return 0;
+    return gt_get_countervalue(env);
 }
 
-static int gt_cval_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t *value)
-{
-    int timeridx = ri->opc1 & 1;
-
-    if (arm_current_pl(env) == 0 &&
-        !extract32(env->cp15.c14_cntkctl, 9 - timeridx, 1)) {
-        return EXCP_UDEF;
-    }
-    *value = env->cp15.c14_timer[timeridx].cval;
-    return 0;
-}
-
-static int gt_cval_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void gt_cval_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     int timeridx = ri->opc1 & 1;
 
     env->cp15.c14_timer[timeridx].cval = value;
     gt_recalc_timer(arm_env_get_cpu(env), timeridx);
-    return 0;
 }
-static int gt_tval_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t *value)
+
+static uint64_t gt_tval_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     int timeridx = ri->crm & 1;
 
-    if (arm_current_pl(env) == 0 &&
-        !extract32(env->cp15.c14_cntkctl, 9 - timeridx, 1)) {
-        return EXCP_UDEF;
-    }
-    *value = (uint32_t)(env->cp15.c14_timer[timeridx].cval -
-                        gt_get_countervalue(env));
-    return 0;
+    return (uint32_t)(env->cp15.c14_timer[timeridx].cval -
+                      gt_get_countervalue(env));
 }
 
-static int gt_tval_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void gt_tval_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     int timeridx = ri->crm & 1;
 
     env->cp15.c14_timer[timeridx].cval = gt_get_countervalue(env) +
         + sextract64(value, 0, 32);
     gt_recalc_timer(arm_env_get_cpu(env), timeridx);
-    return 0;
 }
 
-static int gt_ctl_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                       uint64_t *value)
-{
-    int timeridx = ri->crm & 1;
-
-    if (arm_current_pl(env) == 0 &&
-        !extract32(env->cp15.c14_cntkctl, 9 - timeridx, 1)) {
-        return EXCP_UDEF;
-    }
-    *value = env->cp15.c14_timer[timeridx].ctl;
-    return 0;
-}
-
-static int gt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value)
+static void gt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                         uint64_t value)
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
     int timeridx = ri->crm & 1;
@@ -912,7 +850,6 @@ static int gt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri,
         qemu_set_irq(cpu->gt_timer_outputs[timeridx],
                      (oldval & 4) && (value & 2));
     }
-    return 0;
 }
 
 void arm_gt_ptimer_cb(void *opaque)
@@ -938,7 +875,7 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
       .access = PL1_RW | PL0_R,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_cntfrq),
       .resetvalue = (1000 * 1000 * 1000) / GTIMER_SCALE,
-      .readfn = gt_cntfrq_read, .raw_readfn = raw_read,
+      .accessfn = gt_cntfrq_access,
     },
     /* overall control: mostly access permissions */
     { .name = "CNTKCTL", .cp = 15, .crn = 14, .crm = 1, .opc1 = 0, .opc2 = 0,
@@ -951,32 +888,36 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
       .type = ARM_CP_IO, .access = PL1_RW | PL0_R,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_PHYS].ctl),
       .resetvalue = 0,
-      .readfn = gt_ctl_read, .writefn = gt_ctl_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write,
+      .accessfn = gt_ptimer_access,
+      .writefn = gt_ctl_write, .raw_writefn = raw_write,
     },
     { .name = "CNTV_CTL", .cp = 15, .crn = 14, .crm = 3, .opc1 = 0, .opc2 = 1,
       .type = ARM_CP_IO, .access = PL1_RW | PL0_R,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_VIRT].ctl),
       .resetvalue = 0,
-      .readfn = gt_ctl_read, .writefn = gt_ctl_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write,
+      .accessfn = gt_vtimer_access,
+      .writefn = gt_ctl_write, .raw_writefn = raw_write,
     },
     /* TimerValue views: a 32 bit downcounting view of the underlying state */
     { .name = "CNTP_TVAL", .cp = 15, .crn = 14, .crm = 2, .opc1 = 0, .opc2 = 0,
       .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .accessfn = gt_ptimer_access,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     { .name = "CNTV_TVAL", .cp = 15, .crn = 14, .crm = 3, .opc1 = 0, .opc2 = 0,
       .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .accessfn = gt_vtimer_access,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     /* The counter itself */
     { .name = "CNTPCT", .cp = 15, .crm = 14, .opc1 = 0,
       .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .accessfn = gt_pct_access,
       .readfn = gt_cnt_read, .resetfn = gt_cnt_reset,
     },
     { .name = "CNTVCT", .cp = 15, .crm = 14, .opc1 = 1,
       .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .accessfn = gt_vct_access,
       .readfn = gt_cnt_read, .resetfn = gt_cnt_reset,
     },
     /* Comparison value, indicating when the timer goes off */
@@ -985,16 +926,16 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
       .type = ARM_CP_64BIT | ARM_CP_IO,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_PHYS].cval),
       .resetvalue = 0,
-      .readfn = gt_cval_read, .writefn = gt_cval_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write,
+      .accessfn = gt_ptimer_access,
+      .writefn = gt_cval_write, .raw_writefn = raw_write,
     },
     { .name = "CNTV_CVAL", .cp = 15, .crm = 14, .opc1 = 3,
       .access = PL1_RW | PL0_R,
       .type = ARM_CP_64BIT | ARM_CP_IO,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_VIRT].cval),
       .resetvalue = 0,
-      .readfn = gt_cval_read, .writefn = gt_cval_write,
-      .raw_readfn = raw_read, .raw_writefn = raw_write,
+      .accessfn = gt_vtimer_access,
+      .writefn = gt_cval_write, .raw_writefn = raw_write,
     },
     REGINFO_SENTINEL
 };
@@ -1010,7 +951,7 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
 
 #endif
 
-static int par_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void par_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
     if (arm_feature(env, ARM_FEATURE_LPAE)) {
         env->cp15.c7_par = value;
@@ -1019,7 +960,6 @@ static int par_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
     } else {
         env->cp15.c7_par = value & 0xfffff1ff;
     }
-    return 0;
 }
 
 #ifndef CONFIG_USER_ONLY
@@ -1035,7 +975,20 @@ static inline bool extended_addresses_enabled(CPUARMState *env)
         && (env->cp15.c2_control & (1U << 31));
 }
 
-static int ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static CPAccessResult ats_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    if (ri->opc2 & 4) {
+        /* Other states are only available with TrustZone; in
+         * a non-TZ implementation these registers don't exist
+         * at all, which is an Uncategorized trap. This underdecoding
+         * is safe because the reginfo is NO_MIGRATE.
+         */
+        return CP_ACCESS_TRAP_UNCATEGORIZED;
+    }
+    return CP_ACCESS_OK;
+}
+
+static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
     hwaddr phys_addr;
     target_ulong page_size;
@@ -1043,10 +996,6 @@ static int ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
     int ret, is_user = ri->opc2 & 2;
     int access_type = ri->opc2 & 1;
 
-    if (ri->opc2 & 4) {
-        /* Other states are only available with TrustZone */
-        return EXCP_UDEF;
-    }
     ret = get_phys_addr(env, value, access_type, is_user,
                         &phys_addr, &prot, &page_size);
     if (extended_addresses_enabled(env)) {
@@ -1088,7 +1037,6 @@ static int ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
         }
         env->cp15.c7_par_hi = 0;
     }
-    return 0;
 }
 #endif
 
@@ -1099,7 +1047,8 @@ static const ARMCPRegInfo vapa_cp_reginfo[] = {
       .writefn = par_write },
 #ifndef CONFIG_USER_ONLY
     { .name = "ATS", .cp = 15, .crn = 7, .crm = 8, .opc1 = 0, .opc2 = CP_ANY,
-      .access = PL1_W, .writefn = ats_write, .type = ARM_CP_NO_MIGRATE },
+      .access = PL1_W, .accessfn = ats_access,
+      .writefn = ats_write, .type = ARM_CP_NO_MIGRATE },
 #endif
     REGINFO_SENTINEL
 };
@@ -1134,52 +1083,26 @@ static uint32_t extended_mpu_ap_bits(uint32_t val)
     return ret;
 }
 
-static int pmsav5_data_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                uint64_t value)
+static void pmsav5_data_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
 {
     env->cp15.c5_data = extended_mpu_ap_bits(value);
-    return 0;
 }
 
-static int pmsav5_data_ap_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t *value)
+static uint64_t pmsav5_data_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = simple_mpu_ap_bits(env->cp15.c5_data);
-    return 0;
+    return simple_mpu_ap_bits(env->cp15.c5_data);
 }
 
-static int pmsav5_insn_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                uint64_t value)
+static void pmsav5_insn_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
 {
     env->cp15.c5_insn = extended_mpu_ap_bits(value);
-    return 0;
 }
 
-static int pmsav5_insn_ap_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t *value)
+static uint64_t pmsav5_insn_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = simple_mpu_ap_bits(env->cp15.c5_insn);
-    return 0;
-}
-
-static int arm946_prbs_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t *value)
-{
-    if (ri->crm >= 8) {
-        return EXCP_UDEF;
-    }
-    *value = env->cp15.c6_region[ri->crm];
-    return 0;
-}
-
-static int arm946_prbs_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                             uint64_t value)
-{
-    if (ri->crm >= 8) {
-        return EXCP_UDEF;
-    }
-    env->cp15.c6_region[ri->crm] = value;
-    return 0;
+    return simple_mpu_ap_bits(env->cp15.c5_insn);
 }
 
 static const ARMCPRegInfo pmsav5_cp_reginfo[] = {
@@ -1204,14 +1127,35 @@ static const ARMCPRegInfo pmsav5_cp_reginfo[] = {
       .access = PL1_RW,
       .fieldoffset = offsetof(CPUARMState, cp15.c2_insn), .resetvalue = 0, },
     /* Protection region base and size registers */
-    { .name = "946_PRBS", .cp = 15, .crn = 6, .crm = CP_ANY, .opc1 = 0,
-      .opc2 = CP_ANY, .access = PL1_RW,
-      .readfn = arm946_prbs_read, .writefn = arm946_prbs_write, },
+    { .name = "946_PRBS0", .cp = 15, .crn = 6, .crm = 0, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[0]) },
+    { .name = "946_PRBS1", .cp = 15, .crn = 6, .crm = 1, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[1]) },
+    { .name = "946_PRBS2", .cp = 15, .crn = 6, .crm = 2, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[2]) },
+    { .name = "946_PRBS3", .cp = 15, .crn = 6, .crm = 3, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[3]) },
+    { .name = "946_PRBS4", .cp = 15, .crn = 6, .crm = 4, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[4]) },
+    { .name = "946_PRBS5", .cp = 15, .crn = 6, .crm = 5, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[5]) },
+    { .name = "946_PRBS6", .cp = 15, .crn = 6, .crm = 6, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[6]) },
+    { .name = "946_PRBS7", .cp = 15, .crn = 6, .crm = 7, .opc1 = 0,
+      .opc2 = CP_ANY, .access = PL1_RW, .resetvalue = 0,
+      .fieldoffset = offsetof(CPUARMState, cp15.c6_region[7]) },
     REGINFO_SENTINEL
 };
 
-static int vmsa_ttbcr_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                uint64_t value)
+static void vmsa_ttbcr_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                 uint64_t value)
 {
     int maskshift = extract32(value, 0, 3);
 
@@ -1228,11 +1172,10 @@ static int vmsa_ttbcr_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
     env->cp15.c2_control = value;
     env->cp15.c2_mask = ~(((uint32_t)0xffffffffu) >> maskshift);
     env->cp15.c2_base_mask = ~((uint32_t)0x3fffu >> maskshift);
-    return 0;
 }
 
-static int vmsa_ttbcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                            uint64_t value)
+static void vmsa_ttbcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                             uint64_t value)
 {
     if (arm_feature(env, ARM_FEATURE_LPAE)) {
         /* With LPAE the TTBCR could result in a change of ASID
@@ -1240,7 +1183,7 @@ static int vmsa_ttbcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
          */
         tlb_flush(env, 1);
     }
-    return vmsa_ttbcr_raw_write(env, ri, value);
+    vmsa_ttbcr_raw_write(env, ri, value);
 }
 
 static void vmsa_ttbcr_reset(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -1273,40 +1216,36 @@ static const ARMCPRegInfo vmsa_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int omap_ticonfig_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void omap_ticonfig_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                uint64_t value)
 {
     env->cp15.c15_ticonfig = value & 0xe7;
     /* The OS_TYPE bit in this register changes the reported CPUID! */
     env->cp15.c0_cpuid = (value & (1 << 5)) ?
         ARM_CPUID_TI915T : ARM_CPUID_TI925T;
-    return 0;
 }
 
-static int omap_threadid_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                               uint64_t value)
+static void omap_threadid_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                uint64_t value)
 {
     env->cp15.c15_threadid = value & 0xffff;
-    return 0;
 }
 
-static int omap_wfi_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t value)
+static void omap_wfi_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                           uint64_t value)
 {
     /* Wait-for-interrupt (deprecated) */
     cpu_interrupt(CPU(arm_env_get_cpu(env)), CPU_INTERRUPT_HALT);
-    return 0;
 }
 
-static int omap_cachemaint_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                                 uint64_t value)
+static void omap_cachemaint_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                                  uint64_t value)
 {
     /* On OMAP there are registers indicating the max/min index of dcache lines
      * containing a dirty line; cache flush operations have to reset these.
      */
     env->cp15.c15_i_max = 0x000;
     env->cp15.c15_i_min = 0xff0;
-    return 0;
 }
 
 static const ARMCPRegInfo omap_cp_reginfo[] = {
@@ -1348,8 +1287,8 @@ static const ARMCPRegInfo omap_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int xscale_cpar_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                             uint64_t value)
+static void xscale_cpar_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                              uint64_t value)
 {
     value &= 0x3fff;
     if (env->cp15.c15_cpar != value) {
@@ -1357,7 +1296,6 @@ static int xscale_cpar_write(CPUARMState *env, const ARMCPRegInfo *ri,
         tb_flush(env);
         env->cp15.c15_cpar = value;
     }
-    return 0;
 }
 
 static const ARMCPRegInfo xscale_cp_reginfo[] = {
@@ -1437,8 +1375,7 @@ static const ARMCPRegInfo strongarm_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int mpidr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                      uint64_t *value)
+static uint64_t mpidr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     CPUState *cs = CPU(arm_env_get_cpu(env));
     uint32_t mpidr = cs->cpu_index;
@@ -1453,8 +1390,7 @@ static int mpidr_read(CPUARMState *env, const ARMCPRegInfo *ri,
          * not currently model any of those cores.
          */
     }
-    *value = mpidr;
-    return 0;
+    return mpidr;
 }
 
 static const ARMCPRegInfo mpidr_cp_reginfo[] = {
@@ -1463,17 +1399,16 @@ static const ARMCPRegInfo mpidr_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int par64_read(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t *value)
+static uint64_t par64_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = ((uint64_t)env->cp15.c7_par_hi << 32) | env->cp15.c7_par;
-    return 0;
+    return ((uint64_t)env->cp15.c7_par_hi << 32) | env->cp15.c7_par;
 }
 
-static int par64_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void par64_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
 {
     env->cp15.c7_par_hi = value >> 32;
     env->cp15.c7_par = value;
-    return 0;
 }
 
 static void par64_reset(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -1482,27 +1417,24 @@ static void par64_reset(CPUARMState *env, const ARMCPRegInfo *ri)
     env->cp15.c7_par = 0;
 }
 
-static int ttbr064_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t *value)
+static uint64_t ttbr064_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = ((uint64_t)env->cp15.c2_base0_hi << 32) | env->cp15.c2_base0;
-    return 0;
+    return ((uint64_t)env->cp15.c2_base0_hi << 32) | env->cp15.c2_base0;
 }
 
-static int ttbr064_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                             uint64_t value)
+static void ttbr064_raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                              uint64_t value)
 {
     env->cp15.c2_base0_hi = value >> 32;
     env->cp15.c2_base0 = value;
-    return 0;
 }
 
-static int ttbr064_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void ttbr064_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     /* Writes to the 64 bit format TTBRs may change the ASID */
     tlb_flush(env, 1);
-    return ttbr064_raw_write(env, ri, value);
+    ttbr064_raw_write(env, ri, value);
 }
 
 static void ttbr064_reset(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -1511,19 +1443,16 @@ static void ttbr064_reset(CPUARMState *env, const ARMCPRegInfo *ri)
     env->cp15.c2_base0 = 0;
 }
 
-static int ttbr164_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t *value)
+static uint64_t ttbr164_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = ((uint64_t)env->cp15.c2_base1_hi << 32) | env->cp15.c2_base1;
-    return 0;
+    return ((uint64_t)env->cp15.c2_base1_hi << 32) | env->cp15.c2_base1;
 }
 
-static int ttbr164_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                         uint64_t value)
+static void ttbr164_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                          uint64_t value)
 {
     env->cp15.c2_base1_hi = value >> 32;
     env->cp15.c2_base1 = value;
-    return 0;
 }
 
 static void ttbr164_reset(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -1560,32 +1489,26 @@ static const ARMCPRegInfo lpae_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int aa64_fpcr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t *value)
+static uint64_t aa64_fpcr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = vfp_get_fpcr(env);
-    return 0;
+    return vfp_get_fpcr(env);
 }
 
-static int aa64_fpcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                           uint64_t value)
+static void aa64_fpcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                            uint64_t value)
 {
     vfp_set_fpcr(env, value);
-    return 0;
 }
 
-static int aa64_fpsr_read(CPUARMState *env, const ARMCPRegInfo *ri,
-                          uint64_t *value)
+static uint64_t aa64_fpsr_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    *value = vfp_get_fpsr(env);
-    return 0;
+    return vfp_get_fpsr(env);
 }
 
-static int aa64_fpsr_write(CPUARMState *env, const ARMCPRegInfo *ri,
-                           uint64_t value)
+static void aa64_fpsr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                            uint64_t value)
 {
     vfp_set_fpsr(env, value);
-    return 0;
 }
 
 static const ARMCPRegInfo v8_cp_reginfo[] = {
@@ -1618,13 +1541,13 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
     REGINFO_SENTINEL
 };
 
-static int sctlr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static void sctlr_write(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
 {
     env->cp15.c1_sys = value;
     /* ??? Lots of these bits are not implemented.  */
     /* This may enable/disable the MMU, so do a TLB flush.  */
     tlb_flush(env, 1);
-    return 0;
 }
 
 void register_cp_regs_for_features(ARMCPU *cpu)
@@ -1707,8 +1630,8 @@ void register_cp_regs_for_features(ARMCPU *cpu)
             .name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0,
             .access = PL0_RW, .resetvalue = cpu->midr & 0xff000000,
             .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcr),
-            .readfn = pmreg_read, .writefn = pmcr_write,
-            .raw_readfn = raw_read, .raw_writefn = raw_write,
+            .accessfn = pmreg_access, .writefn = pmcr_write,
+            .raw_writefn = raw_write,
         };
         ARMCPRegInfo clidr = {
             .name = "CLIDR", .cp = 15, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = 1,
@@ -2202,17 +2125,15 @@ const ARMCPRegInfo *get_arm_cp_reginfo(GHashTable *cpregs, uint32_t encoded_cp)
     return g_hash_table_lookup(cpregs, &encoded_cp);
 }
 
-int arm_cp_write_ignore(CPUARMState *env, const ARMCPRegInfo *ri,
-                        uint64_t value)
+void arm_cp_write_ignore(CPUARMState *env, const ARMCPRegInfo *ri,
+                         uint64_t value)
 {
     /* Helper coprocessor write function for write-ignore registers */
-    return 0;
 }
 
-int arm_cp_read_zero(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t *value)
+uint64_t arm_cp_read_zero(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-    /* Helper coprocessor write function for read-as-zero registers */
+    /* Helper coprocessor read function for read-as-zero registers */
-    *value = 0;
     return 0;
 }
 
@@ -2718,7 +2639,7 @@ void arm_cpu_do_interrupt(CPUState *cs)
         return; /* Never happens.  Keep compiler happy.  */
     }
     /* High vectors.  */
-    if (env->cp15.c1_sys & (1 << 13)) {
+    if (env->cp15.c1_sys & SCTLR_V) {
         /* when enabled, base address cannot be remapped.  */
         addr += 0xffff0000;
     } else {
@@ -2741,7 +2662,7 @@ void arm_cpu_do_interrupt(CPUState *cs)
     /* this is a lie, as the was no c1_sys on V4T/V5, but who cares
      * and we should just guard the thumb mode on V4 */
     if (arm_feature(env, ARM_FEATURE_V4T)) {
-        env->thumb = (env->cp15.c1_sys & (1 << 30)) != 0;
+        env->thumb = (env->cp15.c1_sys & SCTLR_TE) != 0;
     }
     env->regs[14] = env->regs[15] + offset;
     env->regs[15] = addr;
@@ -2767,12 +2688,15 @@ static inline int check_ap(CPUARMState *env, int ap, int domain_prot,
 
   switch (ap) {
   case 0:
+      if (arm_feature(env, ARM_FEATURE_V7)) {
+          return 0;
+      }
       if (access_type == 1)
           return 0;
-      switch ((env->cp15.c1_sys >> 8) & 3) {
-      case 1:
+      switch (env->cp15.c1_sys & (SCTLR_S | SCTLR_R)) {
+      case SCTLR_S:
           return is_user ? 0 : PAGE_READ;
-      case 2:
+      case SCTLR_R:
           return PAGE_READ;
       default:
           return 0;
@@ -3003,7 +2927,7 @@ static int get_phys_addr_v6(CPUARMState *env, uint32_t address, int access_type,
             goto do_fault;
 
         /* The simplified model uses AP[0] as an access control bit.  */
-        if ((env->cp15.c1_sys & (1 << 29)) && (ap & 1) == 0) {
+        if ((env->cp15.c1_sys & SCTLR_AFE) && (ap & 1) == 0) {
             /* Access flag fault.  */
             code = (code == 15) ? 6 : 3;
             goto do_fault;
@@ -3295,7 +3219,7 @@ static inline int get_phys_addr(CPUARMState *env, uint32_t address,
     if (address < 0x02000000)
         address += env->cp15.c13_fcse;
 
-    if ((env->cp15.c1_sys & 1) == 0) {
+    if ((env->cp15.c1_sys & SCTLR_M) == 0) {
         /* MMU/MPU disabled.  */
         *phys_ptr = address;
         *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
@@ -3308,7 +3232,7 @@ static inline int get_phys_addr(CPUARMState *env, uint32_t address,
     } else if (extended_addresses_enabled(env)) {
         return get_phys_addr_lpae(env, address, access_type, is_user, phys_ptr,
                                   prot, page_size);
-    } else if (env->cp15.c1_sys & (1 << 23)) {
+    } else if (env->cp15.c1_sys & SCTLR_XP) {
         return get_phys_addr_v6(env, address, access_type, is_user, phys_ptr,
                                 prot, page_size);
     } else {
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 951e6ada07..19bd620532 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -57,6 +57,7 @@ DEF_HELPER_1(cpsr_read, i32, env)
 DEF_HELPER_3(v7m_msr, void, env, i32, i32)
 DEF_HELPER_2(v7m_mrs, i32, env, i32)
 
+DEF_HELPER_2(access_check_cp_reg, void, env, ptr)
 DEF_HELPER_3(set_cp_reg, void, env, ptr, i32)
 DEF_HELPER_2(get_cp_reg, i32, env, ptr)
 DEF_HELPER_3(set_cp_reg64, void, env, ptr, i64)
@@ -382,6 +383,8 @@ DEF_HELPER_3(neon_cge_f32, i32, i32, i32, ptr)
 DEF_HELPER_3(neon_cgt_f32, i32, i32, i32, ptr)
 DEF_HELPER_3(neon_acge_f32, i32, i32, i32, ptr)
 DEF_HELPER_3(neon_acgt_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_acge_f64, i64, i64, i64, ptr)
+DEF_HELPER_3(neon_acgt_f64, i64, i64, i64, ptr)
 
 /* iwmmxt_helper.c */
 DEF_HELPER_2(iwmmxt_maddsq, i64, i64, i64)
diff --git a/target-arm/kvm-consts.h b/target-arm/kvm-consts.h
index 0e7f889cba..6009a33f10 100644
--- a/target-arm/kvm-consts.h
+++ b/target-arm/kvm-consts.h
@@ -50,15 +50,29 @@ MISMATCH_CHECK(PSCI_FN_CPU_OFF, KVM_PSCI_FN_CPU_OFF)
 MISMATCH_CHECK(PSCI_FN_CPU_ON, KVM_PSCI_FN_CPU_ON)
 MISMATCH_CHECK(PSCI_FN_MIGRATE, KVM_PSCI_FN_MIGRATE)
 
+/* Note that KVM uses overlapping values for AArch32 and AArch64
+ * target CPU numbers. AArch32 targets:
+ */
 #define QEMU_KVM_ARM_TARGET_CORTEX_A15 0
+#define QEMU_KVM_ARM_TARGET_CORTEX_A7 1
+
+/* AArch64 targets: */
+#define QEMU_KVM_ARM_TARGET_AEM_V8 0
+#define QEMU_KVM_ARM_TARGET_FOUNDATION_V8 1
+#define QEMU_KVM_ARM_TARGET_CORTEX_A57 2
 
 /* There's no kernel define for this: sentinel value which
  * matches no KVM target value for either 64 or 32 bit
  */
 #define QEMU_KVM_ARM_TARGET_NONE UINT_MAX
 
-#ifndef TARGET_AARCH64
+#ifdef TARGET_AARCH64
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_AEM_V8, KVM_ARM_TARGET_AEM_V8)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_FOUNDATION_V8, KVM_ARM_TARGET_FOUNDATION_V8)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A57, KVM_ARM_TARGET_CORTEX_A57)
+#else
 MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A15, KVM_ARM_TARGET_CORTEX_A15)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A7, KVM_ARM_TARGET_CORTEX_A7)
 #endif
 
 #define CP_REG_ARM64                   0x6000000000000000ULL
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index b4c86904f4..13752baf63 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1823,6 +1823,22 @@ uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
     return -float32_lt(f1, f0, fpst);
 }
 
+uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    float64 f0 = float64_abs(make_float64(a));
+    float64 f1 = float64_abs(make_float64(b));
+    return -float64_le(f1, f0, fpst);
+}
+
+uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    float64 f0 = float64_abs(make_float64(a));
+    float64 f1 = float64_abs(make_float64(b));
+    return -float64_lt(f1, f0, fpst);
+}
+
 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
 
 void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index a918e5b27a..eb0fccd98f 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -273,44 +273,50 @@ void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val)
     }
 }
 
+void HELPER(access_check_cp_reg)(CPUARMState *env, void *rip)
+{
+    const ARMCPRegInfo *ri = rip;
+    switch (ri->accessfn(env, ri)) {
+    case CP_ACCESS_OK:
+        return;
+    case CP_ACCESS_TRAP:
+    case CP_ACCESS_TRAP_UNCATEGORIZED:
+        /* These cases will eventually need to generate different
+         * syndrome information.
+         */
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    raise_exception(env, EXCP_UDEF);
+}
+
 void HELPER(set_cp_reg)(CPUARMState *env, void *rip, uint32_t value)
 {
     const ARMCPRegInfo *ri = rip;
-    int excp = ri->writefn(env, ri, value);
-    if (excp) {
-        raise_exception(env, excp);
-    }
+
+    ri->writefn(env, ri, value);
 }
 
 uint32_t HELPER(get_cp_reg)(CPUARMState *env, void *rip)
 {
     const ARMCPRegInfo *ri = rip;
-    uint64_t value;
-    int excp = ri->readfn(env, ri, &value);
-    if (excp) {
-        raise_exception(env, excp);
-    }
-    return value;
+
+    return ri->readfn(env, ri);
 }
 
 void HELPER(set_cp_reg64)(CPUARMState *env, void *rip, uint64_t value)
 {
     const ARMCPRegInfo *ri = rip;
-    int excp = ri->writefn(env, ri, value);
-    if (excp) {
-        raise_exception(env, excp);
-    }
+
+    ri->writefn(env, ri, value);
 }
 
 uint64_t HELPER(get_cp_reg64)(CPUARMState *env, void *rip)
 {
     const ARMCPRegInfo *ri = rip;
-    uint64_t value;
-    int excp = ri->readfn(env, ri, &value);
-    if (excp) {
-        raise_exception(env, excp);
-    }
-    return value;
+
+    return ri->readfn(env, ri);
 }
 
 /* ??? Flag setting arithmetic is awkward because we need to do comparisons.
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index d60223af8e..8752e7e16c 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -75,8 +75,12 @@ typedef struct AArch64DecodeTable {
 /* Function prototype for gen_ functions for calling Neon helpers */
 typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
 typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
 typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
 typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
+typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
+typedef void NeonGenTwoSingleOPFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
+typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
 
 /* initialize TCG globals.  */
 void a64_translate_init(void)
@@ -614,20 +618,26 @@ static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1)
  */
 
 /*
- * Store from GPR register to memory
+ * Store from GPR register to memory.
  */
+static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source,
+                             TCGv_i64 tcg_addr, int size, int memidx)
+{
+    g_assert(size <= 3);
+    tcg_gen_qemu_st_i64(source, tcg_addr, memidx, MO_TE + size);
+}
+
 static void do_gpr_st(DisasContext *s, TCGv_i64 source,
                       TCGv_i64 tcg_addr, int size)
 {
-    g_assert(size <= 3);
-    tcg_gen_qemu_st_i64(source, tcg_addr, get_mem_index(s), MO_TE + size);
+    do_gpr_st_memidx(s, source, tcg_addr, size, get_mem_index(s));
 }
 
 /*
  * Load from memory to GPR register
  */
-static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
-                      int size, bool is_signed, bool extend)
+static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
+                             int size, bool is_signed, bool extend, int memidx)
 {
     TCGMemOp memop = MO_TE + size;
 
@@ -637,7 +647,7 @@ static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
         memop += MO_SIGN;
     }
 
-    tcg_gen_qemu_ld_i64(dest, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop);
 
     if (extend && is_signed) {
         g_assert(size < 3);
@@ -645,6 +655,13 @@ static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
     }
 }
 
+static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr,
+                      int size, bool is_signed, bool extend)
+{
+    do_gpr_ld_memidx(s, dest, tcg_addr, size, is_signed, extend,
+                     get_mem_index(s));
+}
+
 /*
  * Store from FP register to memory
  */
@@ -1175,7 +1192,12 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
                                                crn, crm, op0, op1, op2));
 
     if (!ri) {
-        /* Unknown register */
+        /* Unknown register; this might be a guest error or a QEMU
+         * unimplemented feature.
+         */
+        qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch64 "
+                      "system register op0:%d op1:%d crn:%d crm:%d op2:%d\n",
+                      isread ? "read" : "write", op0, op1, crn, crm, op2);
         unallocated_encoding(s);
         return;
     }
@@ -1186,6 +1208,17 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         return;
     }
 
+    if (ri->accessfn) {
+        /* Emit code to perform further access permissions checks at
+         * runtime; this may result in an exception.
+         */
+        TCGv_ptr tmpptr;
+        gen_a64_set_pc_im(s->pc - 4);
+        tmpptr = tcg_const_ptr(ri);
+        gen_helper_access_check_cp_reg(cpu_env, tmpptr);
+        tcg_temp_free_ptr(tmpptr);
+    }
+
     /* Handle special cases first */
     switch (ri->type & ~(ARM_CP_FLAG_MASK & ~ARM_CP_SPECIAL)) {
     case ARM_CP_NOP:
@@ -1213,7 +1246,6 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
             tcg_gen_movi_i64(tcg_rt, ri->resetvalue);
         } else if (ri->readfn) {
             TCGv_ptr tmpptr;
-            gen_a64_set_pc_im(s->pc - 4);
             tmpptr = tcg_const_ptr(ri);
             gen_helper_get_cp_reg64(tcg_rt, cpu_env, tmpptr);
             tcg_temp_free_ptr(tmpptr);
@@ -1226,7 +1258,6 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
             return;
         } else if (ri->writefn) {
             TCGv_ptr tmpptr;
-            gen_a64_set_pc_im(s->pc - 4);
             tmpptr = tcg_const_ptr(ri);
             gen_helper_set_cp_reg64(cpu_env, tmpptr, tcg_rt);
             tcg_temp_free_ptr(tmpptr);
@@ -1471,12 +1502,68 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
 }
 #else
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv_i64 addr, int size, int is_pair)
+                                TCGv_i64 inaddr, int size, int is_pair)
 {
-    qemu_log_mask(LOG_UNIMP,
-                  "%s:%d: system mode store_exclusive unsupported "
-                  "at pc=%016" PRIx64 "\n",
-                  __FILE__, __LINE__, s->pc - 4);
+    /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]
+     *     && (!is_pair || env->exclusive_high == [addr + datasize])) {
+     *     [addr] = {Rt};
+     *     if (is_pair) {
+     *         [addr + datasize] = {Rt2};
+     *     }
+     *     {Rd} = 0;
+     * } else {
+     *     {Rd} = 1;
+     * }
+     * env->exclusive_addr = -1;
+     */
+    int fail_label = gen_new_label();
+    int done_label = gen_new_label();
+    TCGv_i64 addr = tcg_temp_local_new_i64();
+    TCGv_i64 tmp;
+
+    /* Copy input into a local temp so it is not trashed when the
+     * basic block ends at the branch insn.
+     */
+    tcg_gen_mov_i64(addr, inaddr);
+    tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
+
+    tmp = tcg_temp_new_i64();
+    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), MO_TE + size);
+    tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
+    tcg_temp_free_i64(tmp);
+
+    if (is_pair) {
+        TCGv_i64 addrhi = tcg_temp_new_i64();
+        TCGv_i64 tmphi = tcg_temp_new_i64();
+
+        tcg_gen_addi_i64(addrhi, addr, 1 << size);
+        tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s), MO_TE + size);
+        tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label);
+
+        tcg_temp_free_i64(tmphi);
+        tcg_temp_free_i64(addrhi);
+    }
+
+    /* We seem to still have the exclusive monitor, so do the store */
+    tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s), MO_TE + size);
+    if (is_pair) {
+        TCGv_i64 addrhi = tcg_temp_new_i64();
+
+        tcg_gen_addi_i64(addrhi, addr, 1 << size);
+        tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi,
+                            get_mem_index(s), MO_TE + size);
+        tcg_temp_free_i64(addrhi);
+    }
+
+    tcg_temp_free_i64(addr);
+
+    tcg_gen_movi_i64(cpu_reg(s, rd), 0);
+    tcg_gen_br(done_label);
+    gen_set_label(fail_label);
+    tcg_gen_movi_i64(cpu_reg(s, rd), 1);
+    gen_set_label(done_label);
+    tcg_gen_movi_i64(cpu_exclusive_addr, -1);
+
 }
 #endif
 
@@ -1750,6 +1837,7 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn)
  * +----+-------+---+-----+-----+---+--------+-----+------+------+
  *
  * idx = 01 -> post-indexed, 11 pre-indexed, 00 unscaled imm. (no writeback)
+ *       10 -> unprivileged
  * V = 0 -> non-vector
  * size: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64bit
  * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32
@@ -1765,6 +1853,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
     bool is_signed = false;
     bool is_store = false;
     bool is_extended = false;
+    bool is_unpriv = (idx == 2);
     bool is_vector = extract32(insn, 26, 1);
     bool post_index;
     bool writeback;
@@ -1773,7 +1862,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
 
     if (is_vector) {
         size |= (opc & 2) << 1;
-        if (size > 4) {
+        if (size > 4 || is_unpriv) {
             unallocated_encoding(s);
             return;
         }
@@ -1781,6 +1870,10 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
     } else {
         if (size == 3 && opc == 2) {
             /* PRFM - prefetch */
+            if (is_unpriv) {
+                unallocated_encoding(s);
+                return;
+            }
             return;
         }
         if (opc == 3 && size > 1) {
@@ -1794,6 +1887,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
 
     switch (idx) {
     case 0:
+    case 2:
         post_index = false;
         writeback = false;
         break;
@@ -1805,9 +1899,6 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
         post_index = false;
         writeback = true;
         break;
-    case 2:
-        g_assert(false);
-        break;
     }
 
     if (rn == 31) {
@@ -1827,10 +1918,13 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn)
         }
     } else {
         TCGv_i64 tcg_rt = cpu_reg(s, rt);
+        int memidx = is_unpriv ? 1 : get_mem_index(s);
+
         if (is_store) {
-            do_gpr_st(s, tcg_rt, tcg_addr, size);
+            do_gpr_st_memidx(s, tcg_rt, tcg_addr, size, memidx);
         } else {
-            do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed, is_extended);
+            do_gpr_ld_memidx(s, tcg_rt, tcg_addr, size,
+                             is_signed, is_extended, memidx);
         }
     }
 
@@ -2010,25 +2104,6 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn)
     }
 }
 
-/* Load/store register (immediate forms) */
-static void disas_ldst_reg_imm(DisasContext *s, uint32_t insn)
-{
-    switch (extract32(insn, 10, 2)) {
-    case 0: case 1: case 3:
-        /* Load/store register (unscaled immediate) */
-        /* Load/store immediate pre/post-indexed */
-        disas_ldst_reg_imm9(s, insn);
-        break;
-    case 2:
-        /* Load/store register unprivileged */
-        unsupported_encoding(s, insn);
-        break;
-    default:
-        unallocated_encoding(s);
-        break;
-    }
-}
-
 /* Load/store register (all forms) */
 static void disas_ldst_reg(DisasContext *s, uint32_t insn)
 {
@@ -2037,7 +2112,11 @@ static void disas_ldst_reg(DisasContext *s, uint32_t insn)
         if (extract32(insn, 21, 1) == 1 && extract32(insn, 10, 2) == 2) {
             disas_ldst_reg_roffset(s, insn);
         } else {
-            disas_ldst_reg_imm(s, insn);
+            /* Load/store register (unscaled immediate)
+             * Load/store immediate pre/post-indexed
+             * Load/store register unprivileged
+             */
+            disas_ldst_reg_imm9(s, insn);
         }
         break;
     case 1:
@@ -5838,7 +5917,100 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
  */
 static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    bool is_u = extract32(insn, 29, 1);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 4);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+
+    if (is_u) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 0x9: /* SQDMLAL, SQDMLAL2 */
+    case 0xb: /* SQDMLSL, SQDMLSL2 */
+    case 0xd: /* SQDMULL, SQDMULL2 */
+        if (size == 0 || size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (size == 2) {
+        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+        TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+        TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+        read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN);
+        read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN);
+
+        tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2);
+        gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, tcg_res, tcg_res);
+
+        switch (opcode) {
+        case 0xd: /* SQDMULL, SQDMULL2 */
+            break;
+        case 0xb: /* SQDMLSL, SQDMLSL2 */
+            tcg_gen_neg_i64(tcg_res, tcg_res);
+            /* fall through */
+        case 0x9: /* SQDMLAL, SQDMLAL2 */
+            read_vec_element(s, tcg_op1, rd, 0, MO_64);
+            gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env,
+                                              tcg_res, tcg_op1);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        write_fp_dreg(s, rd, tcg_res);
+
+        tcg_temp_free_i64(tcg_op1);
+        tcg_temp_free_i64(tcg_op2);
+        tcg_temp_free_i64(tcg_res);
+    } else {
+        TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+        TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+        TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+        read_vec_element_i32(s, tcg_op1, rn, 0, MO_16);
+        read_vec_element_i32(s, tcg_op2, rm, 0, MO_16);
+
+        gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2);
+        gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, tcg_res, tcg_res);
+
+        switch (opcode) {
+        case 0xd: /* SQDMULL, SQDMULL2 */
+            break;
+        case 0xb: /* SQDMLSL, SQDMLSL2 */
+            gen_helper_neon_negl_u32(tcg_res, tcg_res);
+            /* fall through */
+        case 0x9: /* SQDMLAL, SQDMLAL2 */
+        {
+            TCGv_i64 tcg_op3 = tcg_temp_new_i64();
+            read_vec_element(s, tcg_op3, rd, 0, MO_32);
+            gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env,
+                                              tcg_res, tcg_op3);
+            tcg_temp_free_i64(tcg_op3);
+            break;
+        }
+        default:
+            g_assert_not_reached();
+        }
+
+        tcg_gen_ext32u_i64(tcg_res, tcg_res);
+        write_fp_dreg(s, rd, tcg_res);
+
+        tcg_temp_free_i32(tcg_op1);
+        tcg_temp_free_i32(tcg_op2);
+        tcg_temp_free_i64(tcg_res);
+    }
 }
 
 static void handle_3same_64(DisasContext *s, int opcode, bool u,
@@ -5950,15 +6122,33 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
             read_vec_element(s, tcg_op2, rm, pass, MO_64);
 
             switch (fpopcode) {
+            case 0x39: /* FMLS */
+                /* As usual for ARM, separate negation for fused multiply-add */
+                gen_helper_vfp_negd(tcg_op1, tcg_op1);
+                /* fall through */
+            case 0x19: /* FMLA */
+                read_vec_element(s, tcg_res, rd, pass, MO_64);
+                gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2,
+                                       tcg_res, fpst);
+                break;
             case 0x18: /* FMAXNM */
                 gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
             case 0x1a: /* FADD */
                 gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x1b: /* FMULX */
+                gen_helper_vfp_mulxd(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x1c: /* FCMEQ */
+                gen_helper_neon_ceq_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x1e: /* FMAX */
                 gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x1f: /* FRECPS */
+                gen_helper_recpsf_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x38: /* FMINNM */
                 gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
@@ -5968,9 +6158,18 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
             case 0x3e: /* FMIN */
                 gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x3f: /* FRSQRTS */
+                gen_helper_rsqrtsf_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x5b: /* FMUL */
                 gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x5c: /* FCMGE */
+                gen_helper_neon_cge_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5d: /* FACGE */
+                gen_helper_neon_acge_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x5f: /* FDIV */
                 gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
@@ -5978,6 +6177,12 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
                 gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
                 gen_helper_vfp_absd(tcg_res, tcg_res);
                 break;
+            case 0x7c: /* FCMGT */
+                gen_helper_neon_cgt_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7d: /* FACGT */
+                gen_helper_neon_acgt_f64(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             default:
                 g_assert_not_reached();
             }
@@ -5997,12 +6202,30 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
             read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
 
             switch (fpopcode) {
+            case 0x39: /* FMLS */
+                /* As usual for ARM, separate negation for fused multiply-add */
+                gen_helper_vfp_negs(tcg_op1, tcg_op1);
+                /* fall through */
+            case 0x19: /* FMLA */
+                read_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+                gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2,
+                                       tcg_res, fpst);
+                break;
             case 0x1a: /* FADD */
                 gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x1b: /* FMULX */
+                gen_helper_vfp_mulxs(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x1c: /* FCMEQ */
+                gen_helper_neon_ceq_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x1e: /* FMAX */
                 gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x1f: /* FRECPS */
+                gen_helper_recpsf_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x18: /* FMAXNM */
                 gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
@@ -6015,9 +6238,18 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
             case 0x3e: /* FMIN */
                 gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x3f: /* FRSQRTS */
+                gen_helper_rsqrtsf_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x5b: /* FMUL */
                 gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
+            case 0x5c: /* FCMGE */
+                gen_helper_neon_cge_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5d: /* FACGE */
+                gen_helper_neon_acge_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             case 0x5f: /* FDIV */
                 gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
                 break;
@@ -6025,6 +6257,12 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
                 gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
                 gen_helper_vfp_abss(tcg_res, tcg_res);
                 break;
+            case 0x7c: /* FCMGT */
+                gen_helper_neon_cgt_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7d: /* FACGT */
+                gen_helper_neon_acgt_f32(tcg_res, tcg_op1, tcg_op2, fpst);
+                break;
             default:
                 g_assert_not_reached();
             }
@@ -6075,15 +6313,13 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
         int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6);
         switch (fpopcode) {
         case 0x1b: /* FMULX */
-        case 0x1c: /* FCMEQ */
         case 0x1f: /* FRECPS */
         case 0x3f: /* FRSQRTS */
-        case 0x5c: /* FCMGE */
         case 0x5d: /* FACGE */
-        case 0x7c: /* FCMGT */
         case 0x7d: /* FACGT */
-            unsupported_encoding(s, insn);
-            return;
+        case 0x1c: /* FCMEQ */
+        case 0x5c: /* FCMGE */
+        case 0x7c: /* FCMGT */
         case 0x7a: /* FABD */
             break;
         default:
@@ -6268,6 +6504,115 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
     }
 }
 
+static void handle_2misc_fcmp_zero(DisasContext *s, int opcode,
+                                   bool is_scalar, bool is_u, bool is_q,
+                                   int size, int rn, int rd)
+{
+    bool is_double = (size == 3);
+    TCGv_ptr fpst = get_fpstatus_ptr();
+
+    if (is_double) {
+        TCGv_i64 tcg_op = tcg_temp_new_i64();
+        TCGv_i64 tcg_zero = tcg_const_i64(0);
+        TCGv_i64 tcg_res = tcg_temp_new_i64();
+        NeonGenTwoDoubleOPFn *genfn;
+        bool swap = false;
+        int pass;
+
+        switch (opcode) {
+        case 0x2e: /* FCMLT (zero) */
+            swap = true;
+            /* fall through */
+        case 0x2c: /* FCMGT (zero) */
+            genfn = gen_helper_neon_cgt_f64;
+            break;
+        case 0x2d: /* FCMEQ (zero) */
+            genfn = gen_helper_neon_ceq_f64;
+            break;
+        case 0x6d: /* FCMLE (zero) */
+            swap = true;
+            /* fall through */
+        case 0x6c: /* FCMGE (zero) */
+            genfn = gen_helper_neon_cge_f64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+            if (swap) {
+                genfn(tcg_res, tcg_zero, tcg_op, fpst);
+            } else {
+                genfn(tcg_res, tcg_op, tcg_zero, fpst);
+            }
+            write_vec_element(s, tcg_res, rd, pass, MO_64);
+        }
+        if (is_scalar) {
+            clear_vec_high(s, rd);
+        }
+
+        tcg_temp_free_i64(tcg_res);
+        tcg_temp_free_i64(tcg_zero);
+        tcg_temp_free_i64(tcg_op);
+    } else {
+        TCGv_i32 tcg_op = tcg_temp_new_i32();
+        TCGv_i32 tcg_zero = tcg_const_i32(0);
+        TCGv_i32 tcg_res = tcg_temp_new_i32();
+        NeonGenTwoSingleOPFn *genfn;
+        bool swap = false;
+        int pass, maxpasses;
+
+        switch (opcode) {
+        case 0x2e: /* FCMLT (zero) */
+            swap = true;
+            /* fall through */
+        case 0x2c: /* FCMGT (zero) */
+            genfn = gen_helper_neon_cgt_f32;
+            break;
+        case 0x2d: /* FCMEQ (zero) */
+            genfn = gen_helper_neon_ceq_f32;
+            break;
+        case 0x6d: /* FCMLE (zero) */
+            swap = true;
+            /* fall through */
+        case 0x6c: /* FCMGE (zero) */
+            genfn = gen_helper_neon_cge_f32;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        if (is_scalar) {
+            maxpasses = 1;
+        } else {
+            maxpasses = is_q ? 4 : 2;
+        }
+
+        for (pass = 0; pass < maxpasses; pass++) {
+            read_vec_element_i32(s, tcg_op, rn, pass, MO_32);
+            if (swap) {
+                genfn(tcg_res, tcg_zero, tcg_op, fpst);
+            } else {
+                genfn(tcg_res, tcg_op, tcg_zero, fpst);
+            }
+            if (is_scalar) {
+                write_fp_sreg(s, rd, tcg_res);
+            } else {
+                write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+            }
+        }
+        tcg_temp_free_i32(tcg_res);
+        tcg_temp_free_i32(tcg_zero);
+        tcg_temp_free_i32(tcg_op);
+        if (!is_q && !is_scalar) {
+            clear_vec_high(s, rd);
+        }
+    }
+
+    tcg_temp_free_ptr(fpst);
+}
+
 /* C3.6.12 AdvSIMD scalar two reg misc
  *  31 30  29 28       24 23  22 21       17 16    12 11 10 9    5 4    0
  * +-----+---+-----------+------+-----------+--------+-----+------+------+
@@ -6297,9 +6642,47 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
             return;
         }
         break;
+    case 0xc ... 0xf:
+    case 0x16 ... 0x1d:
+    case 0x1f:
+        /* Floating point: U, size[1] and opcode indicate operation;
+         * size[0] indicates single or double precision.
+         */
+        opcode |= (extract32(size, 1, 1) << 5) | (u << 6);
+        size = extract32(size, 0, 1) ? 3 : 2;
+        switch (opcode) {
+        case 0x2c: /* FCMGT (zero) */
+        case 0x2d: /* FCMEQ (zero) */
+        case 0x2e: /* FCMLT (zero) */
+        case 0x6c: /* FCMGE (zero) */
+        case 0x6d: /* FCMLE (zero) */
+            handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd);
+            return;
+        case 0x1a: /* FCVTNS */
+        case 0x1b: /* FCVTMS */
+        case 0x1c: /* FCVTAS */
+        case 0x1d: /* SCVTF */
+        case 0x3a: /* FCVTPS */
+        case 0x3b: /* FCVTZS */
+        case 0x3d: /* FRECPE */
+        case 0x3f: /* FRECPX */
+        case 0x56: /* FCVTXN, FCVTXN2 */
+        case 0x5a: /* FCVTNU */
+        case 0x5b: /* FCVTMU */
+        case 0x5c: /* FCVTAU */
+        case 0x5d: /* UCVTF */
+        case 0x7a: /* FCVTPU */
+        case 0x7b: /* FCVTZU */
+        case 0x7d: /* FRSQRTE */
+            unsupported_encoding(s, insn);
+            return;
+        default:
+            unallocated_encoding(s);
+            return;
+        }
+        break;
     default:
         /* Other categories of encoding in this class:
-         *  + floating point (single and double)
          *  + SUQADD/USQADD/SQABS/SQNEG : size 8, 16, 32 or 64
          *  + SQXTN/SQXTN2/SQXTUN/SQXTUN2/UQXTN/UQXTN2:
          *    narrowing saturate ops: size 64/32/16 -> 32/16/8
@@ -6322,17 +6705,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
     }
 }
 
-/* C3.6.13 AdvSIMD scalar x indexed element
- *  31 30  29 28       24 23  22 21  20  19  16 15 12  11  10 9    5 4    0
- * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
- * | 0 1 | U | 1 1 1 1 1 | size | L | M |  Rm  | opc | H | 0 |  Rn  |  Rd  |
- * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
- */
-static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
-{
-    unsupported_encoding(s, insn);
-}
-
 /* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
 static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
                                  int immh, int immb, int opcode, int rn, int rd)
@@ -6514,6 +6886,24 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
     }
 }
 
+/* Generate code to do a "long" addition or subtraction, ie one done in
+ * TCGv_i64 on vector lanes twice the width specified by size.
+ */
+static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res,
+                          TCGv_i64 tcg_op1, TCGv_i64 tcg_op2)
+{
+    static NeonGenTwo64OpFn * const fns[3][2] = {
+        { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 },
+        { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 },
+        { tcg_gen_add_i64, tcg_gen_sub_i64 },
+    };
+    NeonGenTwo64OpFn *genfn;
+    assert(size < 3);
+
+    genfn = fns[size][is_sub];
+    genfn(tcg_res, tcg_op1, tcg_op2);
+}
+
 static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
                                 int opcode, int rd, int rn, int rm)
 {
@@ -6569,6 +6959,12 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
             }
 
             switch (opcode) {
+            case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+                tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2);
+                break;
+            case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+                tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2);
+                break;
             case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
             case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
             {
@@ -6589,15 +6985,31 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
             case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
                 tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
                 break;
+            case 9: /* SQDMLAL, SQDMLAL2 */
+            case 11: /* SQDMLSL, SQDMLSL2 */
+            case 13: /* SQDMULL, SQDMULL2 */
+                tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+                gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env,
+                                                  tcg_passres, tcg_passres);
+                break;
             default:
                 g_assert_not_reached();
             }
 
-            if (accop > 0) {
+            if (opcode == 9 || opcode == 11) {
+                /* saturating accumulate ops */
+                if (accop < 0) {
+                    tcg_gen_neg_i64(tcg_passres, tcg_passres);
+                }
+                gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env,
+                                                  tcg_res[pass], tcg_passres);
+            } else if (accop > 0) {
                 tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
-                tcg_temp_free_i64(tcg_passres);
             } else if (accop < 0) {
                 tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+            }
+
+            if (accop != 0) {
                 tcg_temp_free_i64(tcg_passres);
             }
 
@@ -6622,6 +7034,23 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
             }
 
             switch (opcode) {
+            case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+            case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+            {
+                TCGv_i64 tcg_op2_64 = tcg_temp_new_i64();
+                static NeonGenWidenFn * const widenfns[2][2] = {
+                    { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
+                    { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
+                };
+                NeonGenWidenFn *widenfn = widenfns[size][is_u];
+
+                widenfn(tcg_op2_64, tcg_op2);
+                widenfn(tcg_passres, tcg_op1);
+                gen_neon_addl(size, (opcode == 2), tcg_passres,
+                              tcg_passres, tcg_op2_64);
+                tcg_temp_free_i64(tcg_op2_64);
+                break;
+            }
             case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
             case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
                 if (size == 0) {
@@ -6655,28 +7084,32 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
                     }
                 }
                 break;
+            case 9: /* SQDMLAL, SQDMLAL2 */
+            case 11: /* SQDMLSL, SQDMLSL2 */
+            case 13: /* SQDMULL, SQDMULL2 */
+                assert(size == 1);
+                gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
+                gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
+                                                  tcg_passres, tcg_passres);
+                break;
             default:
                 g_assert_not_reached();
             }
             tcg_temp_free_i32(tcg_op1);
             tcg_temp_free_i32(tcg_op2);
 
-            if (accop > 0) {
-                if (size == 0) {
-                    gen_helper_neon_addl_u16(tcg_res[pass], tcg_res[pass],
-                                             tcg_passres);
+            if (accop != 0) {
+                if (opcode == 9 || opcode == 11) {
+                    /* saturating accumulate ops */
+                    if (accop < 0) {
+                        gen_helper_neon_negl_u32(tcg_passres, tcg_passres);
+                    }
+                    gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env,
+                                                      tcg_res[pass],
+                                                      tcg_passres);
                 } else {
-                    gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass],
-                                             tcg_passres);
-                }
-                tcg_temp_free_i64(tcg_passres);
-            } else if (accop < 0) {
-                if (size == 0) {
-                    gen_helper_neon_subl_u16(tcg_res[pass], tcg_res[pass],
-                                             tcg_passres);
-                } else {
-                    gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass],
-                                             tcg_passres);
+                    gen_neon_addl(size, (accop < 0), tcg_res[pass],
+                                  tcg_res[pass], tcg_passres);
                 }
                 tcg_temp_free_i64(tcg_passres);
             }
@@ -6689,6 +7122,95 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
     tcg_temp_free_i64(tcg_res[1]);
 }
 
+static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size,
+                            int opcode, int rd, int rn, int rm)
+{
+    TCGv_i64 tcg_res[2];
+    int part = is_q ? 2 : 0;
+    int pass;
+
+    for (pass = 0; pass < 2; pass++) {
+        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+        TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+        TCGv_i64 tcg_op2_wide = tcg_temp_new_i64();
+        static NeonGenWidenFn * const widenfns[3][2] = {
+            { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 },
+            { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 },
+            { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 },
+        };
+        NeonGenWidenFn *widenfn = widenfns[size][is_u];
+
+        read_vec_element(s, tcg_op1, rn, pass, MO_64);
+        read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32);
+        widenfn(tcg_op2_wide, tcg_op2);
+        tcg_temp_free_i32(tcg_op2);
+        tcg_res[pass] = tcg_temp_new_i64();
+        gen_neon_addl(size, (opcode == 3),
+                      tcg_res[pass], tcg_op1, tcg_op2_wide);
+        tcg_temp_free_i64(tcg_op1);
+        tcg_temp_free_i64(tcg_op2_wide);
+    }
+
+    for (pass = 0; pass < 2; pass++) {
+        write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+        tcg_temp_free_i64(tcg_res[pass]);
+    }
+}
+
+static void do_narrow_high_u32(TCGv_i32 res, TCGv_i64 in)
+{
+    tcg_gen_shri_i64(in, in, 32);
+    tcg_gen_trunc_i64_i32(res, in);
+}
+
+static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in)
+{
+    tcg_gen_addi_i64(in, in, 1U << 31);
+    do_narrow_high_u32(res, in);
+}
+
+static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,
+                                 int opcode, int rd, int rn, int rm)
+{
+    TCGv_i32 tcg_res[2];
+    int part = is_q ? 2 : 0;
+    int pass;
+
+    for (pass = 0; pass < 2; pass++) {
+        TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+        TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+        TCGv_i64 tcg_wideres = tcg_temp_new_i64();
+        static NeonGenNarrowFn * const narrowfns[3][2] = {
+            { gen_helper_neon_narrow_high_u8,
+              gen_helper_neon_narrow_round_high_u8 },
+            { gen_helper_neon_narrow_high_u16,
+              gen_helper_neon_narrow_round_high_u16 },
+            { do_narrow_high_u32, do_narrow_round_high_u32 },
+        };
+        NeonGenNarrowFn *gennarrow = narrowfns[size][is_u];
+
+        read_vec_element(s, tcg_op1, rn, pass, MO_64);
+        read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+        gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2);
+
+        tcg_temp_free_i64(tcg_op1);
+        tcg_temp_free_i64(tcg_op2);
+
+        tcg_res[pass] = tcg_temp_new_i32();
+        gennarrow(tcg_res[pass], tcg_wideres);
+        tcg_temp_free_i64(tcg_wideres);
+    }
+
+    for (pass = 0; pass < 2; pass++) {
+        write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32);
+        tcg_temp_free_i32(tcg_res[pass]);
+    }
+    if (!is_q) {
+        clear_vec_high(s, rd);
+    }
+}
+
 /* C3.6.15 AdvSIMD three different
  *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+---+------+--------+-----+------+------+
@@ -6719,31 +7241,43 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
     case 1: /* SADDW, SADDW2, UADDW, UADDW2 */
     case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */
         /* 64 x 128 -> 128 */
-        unsupported_encoding(s, insn);
+        if (size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm);
         break;
     case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */
     case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */
         /* 128 x 128 -> 64 */
+        if (size == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
+        break;
+    case 14: /* PMULL, PMULL2 */
+        if (is_u || size == 1 || size == 2) {
+            unallocated_encoding(s);
+            return;
+        }
         unsupported_encoding(s, insn);
         break;
-    case 9:
-    case 11:
-    case 13:
-    case 14:
-        if (is_u) {
+    case 9: /* SQDMLAL, SQDMLAL2 */
+    case 11: /* SQDMLSL, SQDMLSL2 */
+    case 13: /* SQDMULL, SQDMULL2 */
+        if (is_u || size == 0) {
             unallocated_encoding(s);
             return;
         }
         /* fall through */
-    case 0:
-    case 2:
-        unsupported_encoding(s, insn);
-        break;
-    case 5:
-    case 7:
-    case 8:
-    case 10:
-    case 12:
+    case 0: /* SADDL, SADDL2, UADDL, UADDL2 */
+    case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */
+    case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+    case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+    case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+    case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+    case 12: /* SMULL, SMULL2, UMULL, UMULL2 */
         /* 64 x 64 -> 128 */
         if (size == 3) {
             unallocated_encoding(s);
@@ -6856,39 +7390,22 @@ static void gen_min_u32(TCGv_i32 res, TCGv_i32 op1, TCGv_i32 op2)
     tcg_gen_movcond_i32(TCG_COND_LEU, res, op1, op2, op1, op2);
 }
 
-/* Pairwise op subgroup of C3.6.16. */
-static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
+/* Pairwise op subgroup of C3.6.16.
+ *
+ * This is called directly or via disas_simd_3same_float for float pairwise
+ * operations where the opcode and size are calculated differently.
+ */
+static void handle_simd_3same_pair(DisasContext *s, int is_q, int u, int opcode,
+                                   int size, int rn, int rm, int rd)
 {
-    int is_q = extract32(insn, 30, 1);
-    int u = extract32(insn, 29, 1);
-    int size = extract32(insn, 22, 2);
-    int opcode = extract32(insn, 11, 5);
-    int rm = extract32(insn, 16, 5);
-    int rn = extract32(insn, 5, 5);
-    int rd = extract32(insn, 0, 5);
+    TCGv_ptr fpst;
     int pass;
 
-    if (size == 3 && !is_q) {
-        unallocated_encoding(s);
-        return;
-    }
-
-    switch (opcode) {
-    case 0x14: /* SMAXP, UMAXP */
-    case 0x15: /* SMINP, UMINP */
-        if (size == 3) {
-            unallocated_encoding(s);
-            return;
-        }
-        break;
-    case 0x17:
-        if (u) {
-            unallocated_encoding(s);
-            return;
-        }
-        break;
-    default:
-        g_assert_not_reached();
+    /* Floating point operations need fpst */
+    if (opcode >= 0x58) {
+        fpst = get_fpstatus_ptr();
+    } else {
+        TCGV_UNUSED_PTR(fpst);
     }
 
     /* These operations work on the concatenated rm:rn, with each pair of
@@ -6906,9 +7423,28 @@ static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
             read_vec_element(s, tcg_op2, passreg, 1, MO_64);
             tcg_res[pass] = tcg_temp_new_i64();
 
-            /* The only 64 bit pairwise integer op is ADDP */
-            assert(opcode == 0x17);
-            tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2);
+            switch (opcode) {
+            case 0x17: /* ADDP */
+                tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2);
+                break;
+            case 0x58: /* FMAXNMP */
+                gen_helper_vfp_maxnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5a: /* FADDP */
+                gen_helper_vfp_addd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5e: /* FMAXP */
+                gen_helper_vfp_maxd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x78: /* FMINNMP */
+                gen_helper_vfp_minnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7e: /* FMINP */
+                gen_helper_vfp_mind(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            default:
+                g_assert_not_reached();
+            }
 
             tcg_temp_free_i64(tcg_op1);
             tcg_temp_free_i64(tcg_op2);
@@ -6925,7 +7461,7 @@ static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
         for (pass = 0; pass < maxpass; pass++) {
             TCGv_i32 tcg_op1 = tcg_temp_new_i32();
             TCGv_i32 tcg_op2 = tcg_temp_new_i32();
-            NeonGenTwoOpFn *genfn;
+            NeonGenTwoOpFn *genfn = NULL;
             int passreg = pass < (maxpass / 2) ? rn : rm;
             int passelt = (is_q && (pass & 1)) ? 2 : 0;
 
@@ -6964,11 +7500,30 @@ static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
                 genfn = fns[size][u];
                 break;
             }
+            /* The FP operations are all on single floats (32 bit) */
+            case 0x58: /* FMAXNMP */
+                gen_helper_vfp_maxnums(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5a: /* FADDP */
+                gen_helper_vfp_adds(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x5e: /* FMAXP */
+                gen_helper_vfp_maxs(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x78: /* FMINNMP */
+                gen_helper_vfp_minnums(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
+            case 0x7e: /* FMINP */
+                gen_helper_vfp_mins(tcg_res[pass], tcg_op1, tcg_op2, fpst);
+                break;
             default:
                 g_assert_not_reached();
             }
 
-            genfn(tcg_res[pass], tcg_op1, tcg_op2);
+            /* FP ops were emitted above; integer ops call genfn now */
+            if (genfn) {
+                genfn(tcg_res[pass], tcg_op1, tcg_op2);
+            }
 
             tcg_temp_free_i32(tcg_op1);
             tcg_temp_free_i32(tcg_op2);
@@ -6982,6 +7537,10 @@ static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
             clear_vec_high(s, rd);
         }
     }
+
+    if (!TCGV_IS_UNUSED_PTR(fpst)) {
+        tcg_temp_free_ptr(fpst);
+    }
 }
 
 /* Floating point op subgroup of C3.6.16. */
@@ -7015,30 +7574,32 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
     case 0x5e: /* FMAXP */
     case 0x78: /* FMINNMP */
     case 0x7e: /* FMINP */
-        /* pairwise ops */
-        unsupported_encoding(s, insn);
+        if (size && !is_q) {
+            unallocated_encoding(s);
+            return;
+        }
+        handle_simd_3same_pair(s, is_q, 0, fpopcode, size ? MO_64 : MO_32,
+                               rn, rm, rd);
         return;
     case 0x1b: /* FMULX */
-    case 0x1c: /* FCMEQ */
     case 0x1f: /* FRECPS */
     case 0x3f: /* FRSQRTS */
-    case 0x5c: /* FCMGE */
     case 0x5d: /* FACGE */
-    case 0x7c: /* FCMGT */
     case 0x7d: /* FACGT */
     case 0x19: /* FMLA */
     case 0x39: /* FMLS */
-        unsupported_encoding(s, insn);
-        return;
     case 0x18: /* FMAXNM */
     case 0x1a: /* FADD */
+    case 0x1c: /* FCMEQ */
     case 0x1e: /* FMAX */
     case 0x38: /* FMINNM */
     case 0x3a: /* FSUB */
     case 0x3e: /* FMIN */
     case 0x5b: /* FMUL */
+    case 0x5c: /* FCMGE */
     case 0x5f: /* FDIV */
     case 0x7a: /* FABD */
+    case 0x7c: /* FCMGT */
         handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
         return;
     default:
@@ -7366,9 +7927,28 @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
     case 0x17: /* ADDP */
     case 0x14: /* SMAXP, UMAXP */
     case 0x15: /* SMINP, UMINP */
+    {
         /* Pairwise operations */
-        disas_simd_3same_pair(s, insn);
+        int is_q = extract32(insn, 30, 1);
+        int u = extract32(insn, 29, 1);
+        int size = extract32(insn, 22, 2);
+        int rm = extract32(insn, 16, 5);
+        int rn = extract32(insn, 5, 5);
+        int rd = extract32(insn, 0, 5);
+        if (opcode == 0x17) {
+            if (u || (size == 3 && !is_q)) {
+                unallocated_encoding(s);
+                return;
+            }
+        } else {
+            if (size == 3) {
+                unallocated_encoding(s);
+                return;
+            }
+        }
+        handle_simd_3same_pair(s, is_q, u, opcode, size, rn, rm, rd);
         break;
+    }
     case 0x18 ... 0x31:
         /* floating point ops, sz[1] and U are part of opcode */
         disas_simd_3same_float(s, insn);
@@ -7618,6 +8198,17 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                 return;
             }
             break;
+        case 0x2c: /* FCMGT (zero) */
+        case 0x2d: /* FCMEQ (zero) */
+        case 0x2e: /* FCMLT (zero) */
+        case 0x6c: /* FCMGE (zero) */
+        case 0x6d: /* FCMLE (zero) */
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd);
+            return;
         case 0x16: /* FCVTN, FCVTN2 */
         case 0x17: /* FCVTL, FCVTL2 */
         case 0x18: /* FRINTN */
@@ -7626,9 +8217,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x1b: /* FCVTMS */
         case 0x1c: /* FCVTAS */
         case 0x1d: /* SCVTF */
-        case 0x2c: /* FCMGT (zero) */
-        case 0x2d: /* FCMEQ (zero) */
-        case 0x2e: /* FCMLT (zero) */
         case 0x38: /* FRINTP */
         case 0x39: /* FRINTZ */
         case 0x3a: /* FCVTPS */
@@ -7642,8 +8230,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
         case 0x5b: /* FCVTMU */
         case 0x5c: /* FCVTAU */
         case 0x5d: /* UCVTF */
-        case 0x6c: /* FCMGE (zero) */
-        case 0x6d: /* FCMLE (zero) */
         case 0x79: /* FRINTI */
         case 0x7a: /* FCVTPU */
         case 0x7b: /* FCVTZU */
@@ -7805,15 +8391,455 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
     }
 }
 
-/* C3.6.18 AdvSIMD vector x indexed element
+/* C3.6.13 AdvSIMD scalar x indexed element
+ *  31 30  29 28       24 23  22 21  20  19  16 15 12  11  10 9    5 4    0
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * | 0 1 | U | 1 1 1 1 1 | size | L | M |  Rm  | opc | H | 0 |  Rn  |  Rd  |
+ * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+
+ * C3.6.18 AdvSIMD vector x indexed element
  *   31  30  29 28       24 23  22 21  20  19  16 15 12  11  10 9    5 4    0
  * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
  * | 0 | Q | U | 0 1 1 1 1 | size | L | M |  Rm  | opc | H | 0 |  Rn  |  Rd  |
  * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+
  */
-static void disas_simd_indexed_vector(DisasContext *s, uint32_t insn)
+static void disas_simd_indexed(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    /* Decodes both the scalar (insn bit 28 set) and the vector forms.
+     * This encoding has two kinds of instruction:
+     *  normal, where we perform elt x idxelt => elt for each vector element
+     *  long, where we perform elt x idxelt and generate a result of
+     *     double the width of the input element
+     * The long ops have a 'part' specifier (i.e. come in INSN, INSN2 pairs).
+     */
+    bool is_scalar = extract32(insn, 28, 1);
+    bool is_q = extract32(insn, 30, 1);
+    bool u = extract32(insn, 29, 1);
+    int size = extract32(insn, 22, 2);
+    int l = extract32(insn, 21, 1);
+    int m = extract32(insn, 20, 1);
+    /* Rm is only 4 bits here; M becomes Rm bit 4 or an index bit below */
+    int rm = extract32(insn, 16, 4);
+    int opcode = extract32(insn, 12, 4);
+    int h = extract32(insn, 11, 1);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    bool is_long = false;
+    bool is_fp = false;
+    int index;
+    TCGv_ptr fpst;
+
+    switch (opcode) {
+    case 0x0: /* MLA */
+    case 0x4: /* MLS */
+        if (!u || is_scalar) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+    case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+    case 0xa: /* SMULL, SMULL2, UMULL, UMULL2 */
+        if (is_scalar) {
+            unallocated_encoding(s);
+            return;
+        }
+        is_long = true;
+        break;
+    case 0x3: /* SQDMLAL, SQDMLAL2 */
+    case 0x7: /* SQDMLSL, SQDMLSL2 */
+    case 0xb: /* SQDMULL, SQDMULL2 */
+        is_long = true;
+        /* fall through */
+    case 0xc: /* SQDMULH */
+    case 0xd: /* SQRDMULH */
+        if (u) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0x8: /* MUL */
+        if (u || is_scalar) {
+            unallocated_encoding(s);
+            return;
+        }
+        break;
+    case 0x1: /* FMLA */
+    case 0x5: /* FMLS */
+        if (u) {
+            unallocated_encoding(s);
+            return;
+        }
+        /* fall through */
+    case 0x9: /* FMUL, FMULX */
+        if (!extract32(size, 1, 1)) {
+            unallocated_encoding(s);
+            return;
+        }
+        is_fp = true;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (is_fp) {
+        /* low bit of size: 1 means double precision, 0 means single */
+        size = extract32(size, 0, 1) ? 3 : 2;
+        if (size == 2) {
+            index = h << 1 | l;
+        } else {
+            if (l || !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
+            index = h;
+        }
+        rm |= (m << 4);
+    } else {
+        switch (size) {
+        case 1:
+            index = h << 2 | l << 1 | m;
+            break;
+        case 2:
+            index = h << 1 | l;
+            rm |= (m << 4);
+            break;
+        default:
+            unallocated_encoding(s);
+            return;
+        }
+    }
+
+    if (is_fp) {
+        fpst = get_fpstatus_ptr();
+    } else {
+        TCGV_UNUSED_PTR(fpst);
+    }
+
+    if (size == 3) {
+        TCGv_i64 tcg_idx = tcg_temp_new_i64();
+        int pass;
+
+        assert(is_fp && is_q && !is_long);
+
+        read_vec_element(s, tcg_idx, rm, index, MO_64);
+
+        for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+            TCGv_i64 tcg_op = tcg_temp_new_i64();
+            TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_op, rn, pass, MO_64);
+
+            switch (opcode) {
+            case 0x5: /* FMLS */
+                /* As usual for ARM, separate negation for fused multiply-add */
+                gen_helper_vfp_negd(tcg_op, tcg_op);
+                /* fall through */
+            case 0x1: /* FMLA */
+                read_vec_element(s, tcg_res, rd, pass, MO_64);
+                gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst);
+                break;
+            case 0x9: /* FMUL, FMULX */
+                if (u) {
+                    gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst);
+                } else {
+                    gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst);
+                }
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            write_vec_element(s, tcg_res, rd, pass, MO_64);
+            tcg_temp_free_i64(tcg_op);
+            tcg_temp_free_i64(tcg_res);
+        }
+
+        if (is_scalar) {
+            clear_vec_high(s, rd);
+        }
+
+        tcg_temp_free_i64(tcg_idx);
+    } else if (!is_long) {
+        /* 32 bit floating point, or 16 or 32 bit integer.
+         * For the 16 bit scalar case we use the usual Neon helpers and
+         * rely on the fact that 0 op 0 == 0 with no side effects.
+         */
+        TCGv_i32 tcg_idx = tcg_temp_new_i32();
+        int pass, maxpasses;
+
+        if (is_scalar) {
+            maxpasses = 1;
+        } else {
+            maxpasses = is_q ? 4 : 2;
+        }
+
+        read_vec_element_i32(s, tcg_idx, rm, index, size);
+
+        if (size == 1 && !is_scalar) {
+            /* The simplest way to handle the 16x16 indexed ops is to duplicate
+             * the index into both halves of the 32 bit tcg_idx and then use
+             * the usual Neon helpers.
+             */
+            tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16);
+        }
+
+        for (pass = 0; pass < maxpasses; pass++) {
+            TCGv_i32 tcg_op = tcg_temp_new_i32();
+            TCGv_i32 tcg_res = tcg_temp_new_i32();
+
+            read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32);
+
+            switch (opcode) {
+            case 0x0: /* MLA */
+            case 0x4: /* MLS */
+            case 0x8: /* MUL */
+            {
+                static NeonGenTwoOpFn * const fns[2][2] = {
+                    { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
+                    { tcg_gen_add_i32, tcg_gen_sub_i32 },
+                };
+                NeonGenTwoOpFn *genfn;
+                bool is_sub = opcode == 0x4;
+
+                if (size == 1) {
+                    gen_helper_neon_mul_u16(tcg_res, tcg_op, tcg_idx);
+                } else {
+                    tcg_gen_mul_i32(tcg_res, tcg_op, tcg_idx);
+                }
+                if (opcode == 0x8) {
+                    break;
+                }
+                read_vec_element_i32(s, tcg_op, rd, pass, MO_32);
+                genfn = fns[size - 1][is_sub];
+                genfn(tcg_res, tcg_op, tcg_res);
+                break;
+            }
+            case 0x5: /* FMLS */
+                /* As usual for ARM, separate negation for fused multiply-add */
+                gen_helper_vfp_negs(tcg_op, tcg_op);
+                /* fall through */
+            case 0x1: /* FMLA */
+                read_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+                gen_helper_vfp_muladds(tcg_res, tcg_op, tcg_idx, tcg_res, fpst);
+                break;
+            case 0x9: /* FMUL, FMULX */
+                if (u) {
+                    gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst);
+                } else {
+                    gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst);
+                }
+                break;
+            case 0xc: /* SQDMULH */
+                if (size == 1) {
+                    gen_helper_neon_qdmulh_s16(tcg_res, cpu_env,
+                                               tcg_op, tcg_idx);
+                } else {
+                    gen_helper_neon_qdmulh_s32(tcg_res, cpu_env,
+                                               tcg_op, tcg_idx);
+                }
+                break;
+            case 0xd: /* SQRDMULH */
+                if (size == 1) {
+                    gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env,
+                                                tcg_op, tcg_idx);
+                } else {
+                    gen_helper_neon_qrdmulh_s32(tcg_res, cpu_env,
+                                                tcg_op, tcg_idx);
+                }
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (is_scalar) {
+                write_fp_sreg(s, rd, tcg_res);
+            } else {
+                write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+            }
+
+            tcg_temp_free_i32(tcg_op);
+            tcg_temp_free_i32(tcg_res);
+        }
+
+        tcg_temp_free_i32(tcg_idx);
+
+        if (!is_q) {
+            clear_vec_high(s, rd);
+        }
+    } else {
+        /* long ops: 16x16->32 or 32x32->64 */
+        TCGv_i64 tcg_res[2];
+        int pass;
+        bool satop = extract32(opcode, 0, 1);
+        TCGMemOp memop = MO_32;
+
+        if (satop || !u) {
+            memop |= MO_SIGN;
+        }
+
+        if (size == 2) {
+            TCGv_i64 tcg_idx = tcg_temp_new_i64();
+
+            read_vec_element(s, tcg_idx, rm, index, memop);
+
+            for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+                TCGv_i64 tcg_op = tcg_temp_new_i64();
+                TCGv_i64 tcg_passres;
+                int passelt;
+
+                if (is_scalar) {
+                    passelt = 0;
+                } else {
+                    passelt = pass + (is_q * 2);
+                }
+
+                read_vec_element(s, tcg_op, rn, passelt, memop);
+
+                tcg_res[pass] = tcg_temp_new_i64();
+
+                if (opcode == 0xa || opcode == 0xb) {
+                    /* Non-accumulating ops: the product is the result */
+                    tcg_passres = tcg_res[pass];
+                } else {
+                    tcg_passres = tcg_temp_new_i64();
+                }
+
+                tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx);
+                tcg_temp_free_i64(tcg_op);
+
+                if (satop) {
+                    /* saturating, doubling: add the product to itself */
+                    gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env,
+                                                      tcg_passres, tcg_passres);
+                }
+
+                if (opcode == 0xa || opcode == 0xb) {
+                    continue;
+                }
+
+                /* Accumulating op: handle accumulate step */
+                read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+
+                switch (opcode) {
+                case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+                    tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+                    break;
+                case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+                    tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+                    break;
+                case 0x7: /* SQDMLSL, SQDMLSL2 */
+                    tcg_gen_neg_i64(tcg_passres, tcg_passres);
+                    /* fall through */
+                case 0x3: /* SQDMLAL, SQDMLAL2 */
+                    gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env,
+                                                      tcg_res[pass],
+                                                      tcg_passres);
+                    break;
+                default:
+                    g_assert_not_reached();
+                }
+                tcg_temp_free_i64(tcg_passres);
+            }
+            tcg_temp_free_i64(tcg_idx);
+
+            if (is_scalar) {
+                clear_vec_high(s, rd);
+            }
+        } else {
+            TCGv_i32 tcg_idx = tcg_temp_new_i32();
+
+            assert(size == 1);
+            read_vec_element_i32(s, tcg_idx, rm, index, size);
+
+            if (!is_scalar) {
+                /* The simplest way to handle the 16x16 indexed ops is to
+                 * duplicate the index into both halves of the 32 bit tcg_idx
+                 * and then use the usual Neon helpers.
+                 */
+                tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16);
+            }
+
+            for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) {
+                TCGv_i32 tcg_op = tcg_temp_new_i32();
+                TCGv_i64 tcg_passres;
+
+                if (is_scalar) {
+                    read_vec_element_i32(s, tcg_op, rn, pass, size);
+                } else {
+                    read_vec_element_i32(s, tcg_op, rn,
+                                         pass + (is_q * 2), MO_32);
+                }
+
+                tcg_res[pass] = tcg_temp_new_i64();
+
+                if (opcode == 0xa || opcode == 0xb) {
+                    /* Non-accumulating ops: the product is the result */
+                    tcg_passres = tcg_res[pass];
+                } else {
+                    tcg_passres = tcg_temp_new_i64();
+                }
+
+                if (memop & MO_SIGN) {
+                    gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx);
+                } else {
+                    gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx);
+                }
+                if (satop) {
+                    gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
+                                                      tcg_passres, tcg_passres);
+                }
+                tcg_temp_free_i32(tcg_op);
+
+                if (opcode == 0xa || opcode == 0xb) {
+                    continue;
+                }
+
+                /* Accumulating op: handle accumulate step */
+                read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+
+                switch (opcode) {
+                case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+                    gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                    break;
+                case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+                    gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass],
+                                             tcg_passres);
+                    break;
+                case 0x7: /* SQDMLSL, SQDMLSL2 */
+                    gen_helper_neon_negl_u32(tcg_passres, tcg_passres);
+                    /* fall through */
+                case 0x3: /* SQDMLAL, SQDMLAL2 */
+                    gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env,
+                                                      tcg_res[pass],
+                                                      tcg_passres);
+                    break;
+                default:
+                    g_assert_not_reached();
+                }
+                tcg_temp_free_i64(tcg_passres);
+            }
+            tcg_temp_free_i32(tcg_idx);
+
+            if (is_scalar) {
+                tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]);
+            }
+        }
+
+        if (is_scalar) {
+            tcg_res[1] = tcg_const_i64(0); /* scalar: zero the high half */
+        }
+
+        for (pass = 0; pass < 2; pass++) {
+            write_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+            tcg_temp_free_i64(tcg_res[pass]);
+        }
+    }
+
+    if (!TCGV_IS_UNUSED_PTR(fpst)) { /* only allocated for the fp ops */
+        tcg_temp_free_ptr(fpst);
+    }
 }
 
 /* C3.6.19 Crypto AES
@@ -7861,7 +8887,7 @@ static const AArch64DecodeTable data_proc_simd[] = {
     { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
     { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes },
     { 0x0e000400, 0x9fe08400, disas_simd_copy },
-    { 0x0f000000, 0x9f000400, disas_simd_indexed_vector },
+    { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */
     /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
     { 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
     { 0x0f000400, 0x9f800400, disas_simd_shift_imm },
@@ -7873,7 +8899,7 @@ static const AArch64DecodeTable data_proc_simd[] = {
     { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
     { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise },
     { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy },
-    { 0x5f000000, 0xdf000400, disas_simd_scalar_indexed },
+    { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */
     { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
     { 0x4e280800, 0xff3e0c00, disas_crypto_aes },
     { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha },
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 782aab8b58..6ccf0ba482 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -6837,6 +6837,17 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
             return 1;
         }
 
+        if (ri->accessfn) {
+            /* Emit code to perform further access permissions checks at
+             * runtime; this may result in an exception.
+             */
+            TCGv_ptr tmpptr;
+            gen_set_pc_im(s, s->pc);
+            tmpptr = tcg_const_ptr(ri);
+            gen_helper_access_check_cp_reg(cpu_env, tmpptr);
+            tcg_temp_free_ptr(tmpptr);
+        }
+
         /* Handle special cases first */
         switch (ri->type & ~(ARM_CP_FLAG_MASK & ~ARM_CP_SPECIAL)) {
         case ARM_CP_NOP:
@@ -6865,7 +6876,6 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
                     tmp64 = tcg_const_i64(ri->resetvalue);
                 } else if (ri->readfn) {
                     TCGv_ptr tmpptr;
-                    gen_set_pc_im(s, s->pc);
                     tmp64 = tcg_temp_new_i64();
                     tmpptr = tcg_const_ptr(ri);
                     gen_helper_get_cp_reg64(tmp64, cpu_env, tmpptr);
@@ -6888,7 +6898,6 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
                     tmp = tcg_const_i32(ri->resetvalue);
                 } else if (ri->readfn) {
                     TCGv_ptr tmpptr;
-                    gen_set_pc_im(s, s->pc);
                     tmp = tcg_temp_new_i32();
                     tmpptr = tcg_const_ptr(ri);
                     gen_helper_get_cp_reg(tmp, cpu_env, tmpptr);
@@ -6923,7 +6932,6 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
                 tcg_temp_free_i32(tmphi);
                 if (ri->writefn) {
                     TCGv_ptr tmpptr = tcg_const_ptr(ri);
-                    gen_set_pc_im(s, s->pc);
                     gen_helper_set_cp_reg64(cpu_env, tmpptr, tmp64);
                     tcg_temp_free_ptr(tmpptr);
                 } else {
@@ -6934,7 +6942,6 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
                 if (ri->writefn) {
                     TCGv_i32 tmp;
                     TCGv_ptr tmpptr;
-                    gen_set_pc_im(s, s->pc);
                     tmp = load_reg(s, rt);
                     tmpptr = tcg_const_ptr(ri);
                     gen_helper_set_cp_reg(cpu_env, tmpptr, tmp);
@@ -6962,6 +6969,19 @@ static int disas_coproc_insn(CPUARMState * env, DisasContext *s, uint32_t insn)
         return 0;
     }
 
+    /* Unknown register; this might be a guest error or a QEMU
+     * unimplemented feature.
+     */
+    if (is64) {
+        qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 "
+                      "64 bit system register cp:%d opc1: %d crm:%d\n",
+                      isread ? "read" : "write", cpnum, opc1, crm);
+    } else {
+        qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 "
+                      "system register cp:%d opc1:%d crn:%d crm:%d opc2:%d\n",
+                      isread ? "read" : "write", cpnum, opc1, crn, crm, opc2);
+    }
+
     return 1;
 }