Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20201102' into staging

target-arm queue:
 * target/arm: Fix Neon emulation bugs on big-endian hosts
 * target/arm: fix handling of HCR.FB
 * target/arm: fix LORID_EL1 access check
 * disas/capstone: Fix monitor disassembly of >32 bytes
 * hw/arm/smmuv3: Fix potential integer overflow (CID 1432363)
 * hw/arm/boot: fix SVE for EL3 direct kernel boot
 * hw/display/omap_lcdc: Fix potential NULL pointer dereference
 * hw/display/exynos4210_fimd: Fix potential NULL pointer dereference
 * target/arm: Get correct MMU index for other-security-state
 * configure: Test that gio libs from pkg-config work
 * hw/intc/arm_gicv3_cpuif: Make GIC maintenance interrupts work
 * docs: Fix building with Sphinx 3
 * tests/qtest/npcm7xx_rng-test: Disable randomness tests

# gpg: Signature made Mon 02 Nov 2020 17:09:00 GMT
# gpg:                using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg:                issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83  15CF 3C25 25ED 1436 0CDE

* remotes/pmaydell/tags/pull-target-arm-20201102: (26 commits)
  tests/qtest/npcm7xx_rng-test: Disable randomness tests
  qemu-option-trace.rst.inc: Don't use option:: markup
  scripts/kerneldoc: For Sphinx 3 use c:macro for macros with arguments
  hw/intc/arm_gicv3_cpuif: Make GIC maintenance interrupts work
  configure: Test that gio libs from pkg-config work
  target/arm: Get correct MMU index for other-security-state
  hw/display/exynos4210_fimd: Fix potential NULL pointer dereference
  hw/display/omap_lcdc: Fix potential NULL pointer dereference
  hw/arm/boot: fix SVE for EL3 direct kernel boot
  hw/arm/smmuv3: Fix potential integer overflow (CID 1432363)
  disas/capstone: Fix monitor disassembly of >32 bytes
  target/arm: fix LORID_EL1 access check
  target/arm: fix handling of HCR.FB
  target/arm: Fix VUDOT/VSDOT (scalar) on big-endian hosts
  target/arm: Fix float16 pairwise Neon ops on big-endian hosts
  target/arm: Improve do_prewiden_3d
  target/arm: Simplify do_long_3d and do_2scalar_long
  target/arm: Rename neon_load_reg64 to vfp_load_reg64
  target/arm: Add read/write_neon_element64
  target/arm: Rename neon_load_reg32 to vfp_load_reg32
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

Peter Maydell 2020-11-03 10:38:05 +00:00
commit c7a7a877b7
17 changed files with 588 additions and 493 deletions

configure

@ -3489,13 +3489,21 @@ if test "$static" = yes && test "$mingw32" = yes; then
fi
if $pkg_config --atleast-version=$glib_req_ver gio-2.0; then
gio=yes
gio_cflags=$($pkg_config --cflags gio-2.0)
gio_libs=$($pkg_config --libs gio-2.0)
gdbus_codegen=$($pkg_config --variable=gdbus_codegen gio-2.0)
if [ ! -x "$gdbus_codegen" ]; then
gdbus_codegen=
fi
# Check that the libraries actually work -- Ubuntu 18.04 ships
# with pkg-config --static --libs data for gio-2.0 that is missing
# -lblkid and will give a link error.
write_c_skeleton
if compile_prog "" "gio_libs" ; then
gio=yes
else
gio=no
fi
else
gio=no
fi
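
The configure hunk above (the "configure: Test that gio libs from pkg-config work" change from the queue) no longer trusts the pkg-config metadata on its own: write_c_skeleton emits a minimal C file and compile_prog keeps gio=yes only if that file actually links against the libraries pkg-config reported. As a rough sketch, the probe amounts to linking something no more complicated than this (the exact skeleton contents are an assumption here, not shown in the patch):

    /* Minimal translation unit; if linking it with the reported gio
     * libraries fails (Ubuntu 18.04's static gio-2.0 data omits -lblkid),
     * configure now falls back to gio=no instead of leaving a build that
     * breaks at link time later. */
    int main(void)
    {
        return 0;
    }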


@ -286,7 +286,7 @@ bool cap_disas_monitor(disassemble_info *info, uint64_t pc, int count)
/* Make certain that we can make progress. */
assert(tsize != 0);
info->read_memory_func(pc, cap_buf + csize, tsize, info);
info->read_memory_func(pc + csize, cap_buf + csize, tsize, info);
csize += tsize;
if (cs_disasm_iter(handle, &cbuf, &csize, &pc, insn)) {
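
That one-line change is the whole of the "disas/capstone: Fix monitor disassembly of >32 bytes" fix: the monitor disassembly reads guest memory in chunks, and every refill after the first must fetch the bytes following the csize bytes already buffered rather than re-reading from the original pc, which made disassembly beyond the first chunk operate on the wrong data. Annotated form of the corrected call, using the same names as the hunk:

    /* each refill continues where the buffered bytes end */
    info->read_memory_func(pc + csize,      /* next unread guest address   */
                           cap_buf + csize, /* append after buffered bytes */
                           tsize, info);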


@ -1,7 +1,7 @@
Specify tracing options.
.. option:: [enable=]PATTERN
``[enable=]PATTERN``
Immediately enable events matching *PATTERN*
(either event name or a globbing pattern). This option is only
@ -11,7 +11,7 @@ Specify tracing options.
Use :option:`-trace help` to print a list of names of trace points.
.. option:: events=FILE
``events=FILE``
Immediately enable events listed in *FILE*.
The file must contain one event name (as listed in the ``trace-events-all``
@ -19,7 +19,7 @@ Specify tracing options.
available if QEMU has been compiled with the ``simple``, ``log`` or
``ftrace`` tracing backend.
.. option:: file=FILE
``file=FILE``
Log output traces to *FILE*.
This option is only available if QEMU has been compiled with


@ -742,6 +742,9 @@ static void do_cpu_reset(void *opaque)
if (cpu_isar_feature(aa64_mte, cpu)) {
env->cp15.scr_el3 |= SCR_ATA;
}
if (cpu_isar_feature(aa64_sve, cpu)) {
env->cp15.cptr_el[3] |= CPTR_EZ;
}
/* AArch64 kernels never boot in secure mode */
assert(!info->secure_boot);
/* This hook is only supported for AArch32 currently:


@ -17,6 +17,7 @@
*/
#include "qemu/osdep.h"
#include "qemu/bitops.h"
#include "hw/irq.h"
#include "hw/sysbus.h"
#include "migration/vmstate.h"
@ -864,7 +865,7 @@ static void smmuv3_s1_range_inval(SMMUState *s, Cmd *cmd)
scale = CMD_SCALE(cmd);
num = CMD_NUM(cmd);
ttl = CMD_TTL(cmd);
num_pages = (num + 1) * (1 << (scale));
num_pages = (num + 1) * BIT_ULL(scale);
}
if (type == SMMU_CMD_TLBI_NH_VA) {
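
The hw/arm/smmuv3 hunk above is the integer overflow flagged by Coverity as CID 1432363: 1 << scale is a 32-bit shift, so for large scale values the shift and the multiplication by num + 1 can wrap before the result ever reaches num_pages, while BIT_ULL(scale) does the shift in 64 bits. A standalone illustration of the difference, with field values chosen only to make the wrap visible and assuming num_pages is a 64-bit quantity, as the fix implies:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned num = 31, scale = 27;
        uint64_t before = (num + 1) * (1 << scale);    /* wraps to 0 in 32 bits */
        uint64_t after  = (num + 1) * (1ULL << scale); /* BIT_ULL(scale) form   */
        printf("%llu vs %llu\n",
               (unsigned long long)before, (unsigned long long)after);
        return 0;
    }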


@ -1275,12 +1275,14 @@ static void exynos4210_fimd_update(void *opaque)
bool blend = false;
uint8_t *host_fb_addr;
bool is_dirty = false;
const int global_width = (s->vidtcon[2] & FIMD_VIDTCON2_SIZE_MASK) + 1;
int global_width;
if (!s || !s->console || !s->enabled ||
surface_bits_per_pixel(qemu_console_surface(s->console)) == 0) {
return;
}
global_width = (s->vidtcon[2] & FIMD_VIDTCON2_SIZE_MASK) + 1;
exynos4210_update_resolution(s);
surface = qemu_console_surface(s->console);


@ -78,14 +78,18 @@ static void omap_lcd_interrupts(struct omap_lcd_panel_s *s)
static void omap_update_display(void *opaque)
{
struct omap_lcd_panel_s *omap_lcd = (struct omap_lcd_panel_s *) opaque;
DisplaySurface *surface = qemu_console_surface(omap_lcd->con);
DisplaySurface *surface;
draw_line_func draw_line;
int size, height, first, last;
int width, linesize, step, bpp, frame_offset;
hwaddr frame_base;
if (!omap_lcd || omap_lcd->plm == 1 || !omap_lcd->enable ||
!surface_bits_per_pixel(surface)) {
if (!omap_lcd || omap_lcd->plm == 1 || !omap_lcd->enable) {
return;
}
surface = qemu_console_surface(omap_lcd->con);
if (!surface_bits_per_pixel(surface)) {
return;
}


@ -399,6 +399,7 @@ static void gicv3_cpuif_virt_update(GICv3CPUState *cs)
int irqlevel = 0;
int fiqlevel = 0;
int maintlevel = 0;
ARMCPU *cpu = ARM_CPU(cs->cpu);
idx = hppvi_index(cs);
trace_gicv3_cpuif_virt_update(gicv3_redist_affid(cs), idx);
@ -424,7 +425,7 @@ static void gicv3_cpuif_virt_update(GICv3CPUState *cs)
qemu_set_irq(cs->parent_vfiq, fiqlevel);
qemu_set_irq(cs->parent_virq, irqlevel);
qemu_set_irq(cs->maintenance_irq, maintlevel);
qemu_set_irq(cpu->gicv3_maintenance_interrupt, maintlevel);
}
static uint64_t icv_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
@ -2624,8 +2625,6 @@ void gicv3_init_cpuif(GICv3State *s)
&& cpu->gic_num_lrs) {
int j;
cs->maintenance_irq = cpu->gicv3_maintenance_interrupt;
cs->num_list_regs = cpu->gic_num_lrs;
cs->vpribits = cpu->gic_vpribits;
cs->vprebits = cpu->gic_vprebits;


@ -153,7 +153,6 @@ struct GICv3CPUState {
qemu_irq parent_fiq;
qemu_irq parent_virq;
qemu_irq parent_vfiq;
qemu_irq maintenance_irq;
/* Redistributor */
uint32_t level; /* Current IRQ level */


@ -839,7 +839,23 @@ sub output_function_rst(%) {
output_highlight_rst($args{'purpose'});
$start = "\n\n**Syntax**\n\n ``";
} else {
print ".. c:function:: ";
if ((split(/\./, $sphinx_version))[0] >= 3) {
# Sphinx 3 and later distinguish macros and functions and
# complain if you use c:function with something that's not
# syntactically valid as a function declaration.
# We assume that anything with a return type is a function
# and anything without is a macro.
if ($args{'functiontype'} ne "") {
print ".. c:function:: ";
} else {
print ".. c:macro:: ";
}
} else {
# Older Sphinx don't support documenting macros that take
# arguments with c:macro, and don't complain about the use
# of c:function for this.
print ".. c:function:: ";
}
}
if ($args{'functiontype'} ne "") {
$start .= $args{'functiontype'} . " " . $args{'function'} . " (";


@ -731,13 +731,12 @@ static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
/*
* Non-IS variants of TLB operations are upgraded to
* IS versions if we are at NS EL1 and HCR_EL2.FB is set to
* IS versions if we are at EL1 and HCR_EL2.FB is effectively set to
* force broadcast of these operations.
*/
static bool tlb_force_broadcast(CPUARMState *env)
{
return (env->cp15.hcr_el2 & HCR_FB) &&
arm_current_el(env) == 1 && arm_is_secure_below_el3(env);
return arm_current_el(env) == 1 && (arm_hcr_el2_eff(env) & HCR_FB);
}
static void tlbiall_write(CPUARMState *env, const ARMCPRegInfo *ri,
@ -6680,9 +6679,10 @@ static uint64_t id_aa64pfr0_read(CPUARMState *env, const ARMCPRegInfo *ri)
#endif
/* Shared logic between LORID and the rest of the LOR* registers.
* Secure state has already been delt with.
* Secure state exclusion has already been dealt with.
*/
static CPAccessResult access_lor_ns(CPUARMState *env)
static CPAccessResult access_lor_ns(CPUARMState *env,
const ARMCPRegInfo *ri, bool isread)
{
int el = arm_current_el(env);
@ -6695,16 +6695,6 @@ static CPAccessResult access_lor_ns(CPUARMState *env)
return CP_ACCESS_OK;
}
static CPAccessResult access_lorid(CPUARMState *env, const ARMCPRegInfo *ri,
bool isread)
{
if (arm_is_secure_below_el3(env)) {
/* Access ok in secure mode. */
return CP_ACCESS_OK;
}
return access_lor_ns(env);
}
static CPAccessResult access_lor_other(CPUARMState *env,
const ARMCPRegInfo *ri, bool isread)
{
@ -6712,7 +6702,7 @@ static CPAccessResult access_lor_other(CPUARMState *env,
/* Access denied in secure mode. */
return CP_ACCESS_TRAP;
}
return access_lor_ns(env);
return access_lor_ns(env, ri, isread);
}
/*
@ -6739,7 +6729,7 @@ static const ARMCPRegInfo lor_reginfo[] = {
.type = ARM_CP_CONST, .resetvalue = 0 },
{ .name = "LORID_EL1", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 0, .crn = 10, .crm = 4, .opc2 = 7,
.access = PL1_R, .accessfn = access_lorid,
.access = PL1_R, .accessfn = access_lor_ns,
.type = ARM_CP_CONST, .resetvalue = 0 },
REGINFO_SENTINEL
};


@ -2719,7 +2719,8 @@ ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env,
/* Return the MMU index for a v7M CPU in the specified security state */
ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
{
bool priv = arm_current_el(env) != 0;
bool priv = arm_v7m_is_handler_mode(env) ||
!(env->v7m.control[secstate] & 1);
return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv);
}
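
The target/arm hunk above is the "Get correct MMU index for other-security-state" fix: privilege for the queried security state is now derived from handler mode or that state's own CONTROL.nPRIV bit, instead of from arm_current_el(), which only reflects the state the CPU is currently in. A hedged illustration of the case the old code answered wrongly (the M_REG_NS constant comes from the surrounding QEMU headers, not from this hunk):

    /* Current state: Secure, privileged thread mode.
     * Non-secure CONTROL.nPRIV is set (unprivileged).
     * Query: MMU index for the Non-secure state. */
    bool priv_old = arm_current_el(env) != 0;            /* true: wrong state  */
    bool priv_new = arm_v7m_is_handler_mode(env) ||
                    !(env->v7m.control[M_REG_NS] & 1);   /* false: as intended */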

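Most of the translate-neon changes that follow are the mechanical part of the big-endian Neon fixes: direct neon_load_reg()/neon_store_reg()/neon_load_reg64() calls and neon_reg_offset(reg, 0) uses are replaced by read/write_neon_element32/64() and neon_full_reg_offset(), with callers now allocating and freeing their own TCG temporaries. The element helpers themselves are added elsewhere in this series; presumably they reduce to a load or store at an element offset that applies the same big-endian XOR correction as the neon_element_offset() helper removed in the first hunk below, along the lines of this sketch (assumed shape, not the actual implementation):

    static void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
    {
        long off = neon_element_offset(reg, ele, memop & MO_SIZE);

        switch (memop) {
        case MO_SB:
            tcg_gen_ld8s_i32(dest, cpu_env, off);
            break;
        case MO_UB:
            tcg_gen_ld8u_i32(dest, cpu_env, off);
            break;
        case MO_32:
            tcg_gen_ld_i32(dest, cpu_env, off);
            break;
        /* 16-bit and signed variants analogous */
        default:
            g_assert_not_reached();
        }
    }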

@ -60,25 +60,6 @@ static inline int neon_3same_fp_size(DisasContext *s, int x)
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"
/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
* where 0 is the least significant end of the register.
*/
static inline long
neon_element_offset(int reg, int element, MemOp size)
{
int element_size = 1 << size;
int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
/* Calculate the offset assuming fully little-endian,
* then XOR to account for the order of the 8-byte units.
*/
if (element_size < 8) {
ofs ^= 8 - element_size;
}
#endif
return neon_reg_offset(reg, 0) + ofs;
}
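/*
 * Worked example of the XOR above (illustration only): on a big-endian host
 * a 16-bit element (element_size = 2) at index 1 starts at little-endian
 * offset 2, and 2 ^ (8 - 2) = 4, so it is accessed at byte 4 of the
 * host-order 8-byte unit; index 0 likewise maps to byte 6.  Skipping this
 * correction is what broke Neon element accesses on big-endian hosts.
 */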
static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
@ -585,12 +566,12 @@ static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
* We cannot write 16 bytes at once because the
* destination is unaligned.
*/
tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
8, 8, tmp);
tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0),
neon_reg_offset(vd, 0), 8, 8);
tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
neon_full_reg_offset(vd), 8, 8);
} else {
tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0),
tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
vec_size, vec_size, tmp);
}
tcg_gen_addi_i32(addr, addr, 1 << size);
@ -691,9 +672,9 @@ static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
int vec_size = a->q ? 16 : 8;
int rd_ofs = neon_reg_offset(a->vd, 0);
int rn_ofs = neon_reg_offset(a->vn, 0);
int rm_ofs = neon_reg_offset(a->vm, 0);
int rd_ofs = neon_full_reg_offset(a->vd);
int rn_ofs = neon_full_reg_offset(a->vn);
int rm_ofs = neon_full_reg_offset(a->vm);
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -975,18 +956,24 @@ static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
* early. Since Q is 0 there are always just two passes, so instead
* of a complicated loop over each pass we just unroll.
*/
tmp = neon_load_reg(a->vn, 0);
tmp2 = neon_load_reg(a->vn, 1);
tmp = tcg_temp_new_i32();
tmp2 = tcg_temp_new_i32();
tmp3 = tcg_temp_new_i32();
read_neon_element32(tmp, a->vn, 0, MO_32);
read_neon_element32(tmp2, a->vn, 1, MO_32);
fn(tmp, tmp, tmp2);
tcg_temp_free_i32(tmp2);
tmp3 = neon_load_reg(a->vm, 0);
tmp2 = neon_load_reg(a->vm, 1);
read_neon_element32(tmp3, a->vm, 0, MO_32);
read_neon_element32(tmp2, a->vm, 1, MO_32);
fn(tmp3, tmp3, tmp2);
tcg_temp_free_i32(tmp2);
neon_store_reg(a->vd, 0, tmp);
neon_store_reg(a->vd, 1, tmp3);
write_neon_element32(tmp, a->vd, 0, MO_32);
write_neon_element32(tmp3, a->vd, 1, MO_32);
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(tmp2);
tcg_temp_free_i32(tmp3);
return true;
}
@ -1177,8 +1164,8 @@ static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
/* Handle a 2-reg-shift insn which can be vectorized. */
int vec_size = a->q ? 16 : 8;
int rd_ofs = neon_reg_offset(a->vd, 0);
int rm_ofs = neon_reg_offset(a->vm, 0);
int rd_ofs = neon_full_reg_offset(a->vd);
int rm_ofs = neon_full_reg_offset(a->vm);
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -1278,9 +1265,9 @@ static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
for (pass = 0; pass < a->q + 1; pass++) {
TCGv_i64 tmp = tcg_temp_new_i64();
neon_load_reg64(tmp, a->vm + pass);
read_neon_element64(tmp, a->vm, pass, MO_64);
fn(tmp, cpu_env, tmp, constimm);
neon_store_reg64(tmp, a->vd + pass);
write_neon_element64(tmp, a->vd, pass, MO_64);
tcg_temp_free_i64(tmp);
}
tcg_temp_free_i64(constimm);
@ -1294,7 +1281,7 @@ static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
* 2-reg-and-shift operations, size < 3 case, where the
* helper needs to be passed cpu_env.
*/
TCGv_i32 constimm;
TCGv_i32 constimm, tmp;
int pass;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@ -1320,12 +1307,14 @@ static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
* by immediate using the variable shift operations.
*/
constimm = tcg_const_i32(dup_const(a->size, a->shift));
tmp = tcg_temp_new_i32();
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
TCGv_i32 tmp = neon_load_reg(a->vm, pass);
read_neon_element32(tmp, a->vm, pass, MO_32);
fn(tmp, cpu_env, tmp, constimm);
neon_store_reg(a->vd, pass, tmp);
write_neon_element32(tmp, a->vd, pass, MO_32);
}
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(constimm);
return true;
}
@ -1383,21 +1372,21 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
constimm = tcg_const_i64(-a->shift);
rm1 = tcg_temp_new_i64();
rm2 = tcg_temp_new_i64();
rd = tcg_temp_new_i32();
/* Load both inputs first to avoid potential overwrite if rm == rd */
neon_load_reg64(rm1, a->vm);
neon_load_reg64(rm2, a->vm + 1);
read_neon_element64(rm1, a->vm, 0, MO_64);
read_neon_element64(rm2, a->vm, 1, MO_64);
shiftfn(rm1, rm1, constimm);
rd = tcg_temp_new_i32();
narrowfn(rd, cpu_env, rm1);
neon_store_reg(a->vd, 0, rd);
write_neon_element32(rd, a->vd, 0, MO_32);
shiftfn(rm2, rm2, constimm);
rd = tcg_temp_new_i32();
narrowfn(rd, cpu_env, rm2);
neon_store_reg(a->vd, 1, rd);
write_neon_element32(rd, a->vd, 1, MO_32);
tcg_temp_free_i32(rd);
tcg_temp_free_i64(rm1);
tcg_temp_free_i64(rm2);
tcg_temp_free_i64(constimm);
@ -1447,10 +1436,14 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
constimm = tcg_const_i32(imm);
/* Load all inputs first to avoid potential overwrite */
rm1 = neon_load_reg(a->vm, 0);
rm2 = neon_load_reg(a->vm, 1);
rm3 = neon_load_reg(a->vm + 1, 0);
rm4 = neon_load_reg(a->vm + 1, 1);
rm1 = tcg_temp_new_i32();
rm2 = tcg_temp_new_i32();
rm3 = tcg_temp_new_i32();
rm4 = tcg_temp_new_i32();
read_neon_element32(rm1, a->vm, 0, MO_32);
read_neon_element32(rm2, a->vm, 1, MO_32);
read_neon_element32(rm3, a->vm, 2, MO_32);
read_neon_element32(rm4, a->vm, 3, MO_32);
rtmp = tcg_temp_new_i64();
shiftfn(rm1, rm1, constimm);
@ -1460,7 +1453,8 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
tcg_temp_free_i32(rm2);
narrowfn(rm1, cpu_env, rtmp);
neon_store_reg(a->vd, 0, rm1);
write_neon_element32(rm1, a->vd, 0, MO_32);
tcg_temp_free_i32(rm1);
shiftfn(rm3, rm3, constimm);
shiftfn(rm4, rm4, constimm);
@ -1471,7 +1465,8 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
narrowfn(rm3, cpu_env, rtmp);
tcg_temp_free_i64(rtmp);
neon_store_reg(a->vd, 1, rm3);
write_neon_element32(rm3, a->vd, 1, MO_32);
tcg_temp_free_i32(rm3);
return true;
}
@ -1572,8 +1567,10 @@ static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
widen_mask = dup_const(a->size + 1, widen_mask);
}
rm0 = neon_load_reg(a->vm, 0);
rm1 = neon_load_reg(a->vm, 1);
rm0 = tcg_temp_new_i32();
rm1 = tcg_temp_new_i32();
read_neon_element32(rm0, a->vm, 0, MO_32);
read_neon_element32(rm1, a->vm, 1, MO_32);
tmp = tcg_temp_new_i64();
widenfn(tmp, rm0);
@ -1582,7 +1579,7 @@ static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
tcg_gen_shli_i64(tmp, tmp, a->shift);
tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
}
neon_store_reg64(tmp, a->vd);
write_neon_element64(tmp, a->vd, 0, MO_64);
widenfn(tmp, rm1);
tcg_temp_free_i32(rm1);
@ -1590,7 +1587,7 @@ static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
tcg_gen_shli_i64(tmp, tmp, a->shift);
tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
}
neon_store_reg64(tmp, a->vd + 1);
write_neon_element64(tmp, a->vd, 1, MO_64);
tcg_temp_free_i64(tmp);
return true;
}
@ -1620,8 +1617,8 @@ static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
{
/* FP operations in 2-reg-and-shift group */
int vec_size = a->q ? 16 : 8;
int rd_ofs = neon_reg_offset(a->vd, 0);
int rm_ofs = neon_reg_offset(a->vm, 0);
int rd_ofs = neon_full_reg_offset(a->vd);
int rm_ofs = neon_full_reg_offset(a->vm);
TCGv_ptr fpst;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@ -1756,7 +1753,7 @@ static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
return true;
}
reg_ofs = neon_reg_offset(a->vd, 0);
reg_ofs = neon_full_reg_offset(a->vd);
vec_size = a->q ? 16 : 8;
imm = asimd_imm_const(a->imm, a->cmode, a->op);
@ -1791,11 +1788,10 @@ static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
NeonGenWidenFn *widenfn,
NeonGenTwo64OpFn *opfn,
bool src1_wide)
int src1_mop, int src2_mop)
{
/* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
TCGv_i64 rn0_64, rn1_64, rm_64;
TCGv_i32 rm;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -1807,12 +1803,12 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
return false;
}
if (!widenfn || !opfn) {
if (!opfn) {
/* size == 3 case, which is an entirely different insn group */
return false;
}
if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
return false;
}
@ -1824,38 +1820,50 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
rn1_64 = tcg_temp_new_i64();
rm_64 = tcg_temp_new_i64();
if (src1_wide) {
neon_load_reg64(rn0_64, a->vn);
if (src1_mop >= 0) {
read_neon_element64(rn0_64, a->vn, 0, src1_mop);
} else {
TCGv_i32 tmp = neon_load_reg(a->vn, 0);
TCGv_i32 tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vn, 0, MO_32);
widenfn(rn0_64, tmp);
tcg_temp_free_i32(tmp);
}
rm = neon_load_reg(a->vm, 0);
if (src2_mop >= 0) {
read_neon_element64(rm_64, a->vm, 0, src2_mop);
} else {
TCGv_i32 tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vm, 0, MO_32);
widenfn(rm_64, tmp);
tcg_temp_free_i32(tmp);
}
widenfn(rm_64, rm);
tcg_temp_free_i32(rm);
opfn(rn0_64, rn0_64, rm_64);
/*
* Load second pass inputs before storing the first pass result, to
* avoid incorrect results if a narrow input overlaps with the result.
*/
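/*
 * Concrete case of that overlap (illustration, not part of the patch):
 * VADDL.S32 q0, d0, d2 writes its first 64-bit result into d0, the low
 * half of q0, while element 1 of d0 is still needed as a second-pass
 * input, so both passes' inputs are read before anything is stored.
 */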
if (src1_wide) {
neon_load_reg64(rn1_64, a->vn + 1);
if (src1_mop >= 0) {
read_neon_element64(rn1_64, a->vn, 1, src1_mop);
} else {
TCGv_i32 tmp = neon_load_reg(a->vn, 1);
TCGv_i32 tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vn, 1, MO_32);
widenfn(rn1_64, tmp);
tcg_temp_free_i32(tmp);
}
rm = neon_load_reg(a->vm, 1);
if (src2_mop >= 0) {
read_neon_element64(rm_64, a->vm, 1, src2_mop);
} else {
TCGv_i32 tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vm, 1, MO_32);
widenfn(rm_64, tmp);
tcg_temp_free_i32(tmp);
}
neon_store_reg64(rn0_64, a->vd);
write_neon_element64(rn0_64, a->vd, 0, MO_64);
widenfn(rm_64, rm);
tcg_temp_free_i32(rm);
opfn(rn1_64, rn1_64, rm_64);
neon_store_reg64(rn1_64, a->vd + 1);
write_neon_element64(rn1_64, a->vd, 1, MO_64);
tcg_temp_free_i64(rn0_64);
tcg_temp_free_i64(rn1_64);
@ -1864,14 +1872,13 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
return true;
}
#define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE) \
#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \
static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
{ \
static NeonGenWidenFn * const widenfn[] = { \
gen_helper_neon_widen_##S##8, \
gen_helper_neon_widen_##S##16, \
tcg_gen_##EXT##_i32_i64, \
NULL, \
NULL, NULL, \
}; \
static NeonGenTwo64OpFn * const addfn[] = { \
gen_helper_neon_##OP##l_u16, \
@ -1879,18 +1886,20 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
tcg_gen_##OP##_i64, \
NULL, \
}; \
return do_prewiden_3d(s, a, widenfn[a->size], \
addfn[a->size], SRC1WIDE); \
int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \
return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \
SRC1WIDE ? MO_Q : narrow_mop, \
narrow_mop); \
}
DO_PREWIDEN(VADDL_S, s, ext, add, false)
DO_PREWIDEN(VADDL_U, u, extu, add, false)
DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
DO_PREWIDEN(VADDW_S, s, ext, add, true)
DO_PREWIDEN(VADDW_U, u, extu, add, true)
DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
DO_PREWIDEN(VADDL_U, u, add, false, 0)
DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
DO_PREWIDEN(VADDW_U, u, add, true, 0)
DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
@ -1927,23 +1936,25 @@ static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
rd0 = tcg_temp_new_i32();
rd1 = tcg_temp_new_i32();
neon_load_reg64(rn_64, a->vn);
neon_load_reg64(rm_64, a->vm);
read_neon_element64(rn_64, a->vn, 0, MO_64);
read_neon_element64(rm_64, a->vm, 0, MO_64);
opfn(rn_64, rn_64, rm_64);
narrowfn(rd0, rn_64);
neon_load_reg64(rn_64, a->vn + 1);
neon_load_reg64(rm_64, a->vm + 1);
read_neon_element64(rn_64, a->vn, 1, MO_64);
read_neon_element64(rm_64, a->vm, 1, MO_64);
opfn(rn_64, rn_64, rm_64);
narrowfn(rd1, rn_64);
neon_store_reg(a->vd, 0, rd0);
neon_store_reg(a->vd, 1, rd1);
write_neon_element32(rd0, a->vd, 0, MO_32);
write_neon_element32(rd1, a->vd, 1, MO_32);
tcg_temp_free_i32(rd0);
tcg_temp_free_i32(rd1);
tcg_temp_free_i64(rn_64);
tcg_temp_free_i64(rm_64);
@ -2018,14 +2029,14 @@ static bool do_long_3d(DisasContext *s, arg_3diff *a,
rd0 = tcg_temp_new_i64();
rd1 = tcg_temp_new_i64();
rn = neon_load_reg(a->vn, 0);
rm = neon_load_reg(a->vm, 0);
rn = tcg_temp_new_i32();
rm = tcg_temp_new_i32();
read_neon_element32(rn, a->vn, 0, MO_32);
read_neon_element32(rm, a->vm, 0, MO_32);
opfn(rd0, rn, rm);
tcg_temp_free_i32(rn);
tcg_temp_free_i32(rm);
rn = neon_load_reg(a->vn, 1);
rm = neon_load_reg(a->vm, 1);
read_neon_element32(rn, a->vn, 1, MO_32);
read_neon_element32(rm, a->vm, 1, MO_32);
opfn(rd1, rn, rm);
tcg_temp_free_i32(rn);
tcg_temp_free_i32(rm);
@ -2033,18 +2044,15 @@ static bool do_long_3d(DisasContext *s, arg_3diff *a,
/* Don't store results until after all loads: they might overlap */
if (accfn) {
tmp = tcg_temp_new_i64();
neon_load_reg64(tmp, a->vd);
accfn(tmp, tmp, rd0);
neon_store_reg64(tmp, a->vd);
neon_load_reg64(tmp, a->vd + 1);
accfn(tmp, tmp, rd1);
neon_store_reg64(tmp, a->vd + 1);
read_neon_element64(tmp, a->vd, 0, MO_64);
accfn(rd0, tmp, rd0);
read_neon_element64(tmp, a->vd, 1, MO_64);
accfn(rd1, tmp, rd1);
tcg_temp_free_i64(tmp);
} else {
neon_store_reg64(rd0, a->vd);
neon_store_reg64(rd1, a->vd + 1);
}
write_neon_element64(rd0, a->vd, 0, MO_64);
write_neon_element64(rd1, a->vd, 1, MO_64);
tcg_temp_free_i64(rd0);
tcg_temp_free_i64(rd1);
@ -2300,9 +2308,9 @@ static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
return true;
}
tcg_gen_gvec_3_ool(neon_reg_offset(a->vd, 0),
neon_reg_offset(a->vn, 0),
neon_reg_offset(a->vm, 0),
tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
neon_full_reg_offset(a->vn),
neon_full_reg_offset(a->vm),
16, 16, 0, fn_gvec);
return true;
}
@ -2327,16 +2335,16 @@ static void gen_neon_dup_high16(TCGv_i32 var)
static inline TCGv_i32 neon_get_scalar(int size, int reg)
{
TCGv_i32 tmp;
if (size == 1) {
tmp = neon_load_reg(reg & 7, reg >> 4);
TCGv_i32 tmp = tcg_temp_new_i32();
if (size == MO_16) {
read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
if (reg & 8) {
gen_neon_dup_high16(tmp);
} else {
gen_neon_dup_low16(tmp);
}
} else {
tmp = neon_load_reg(reg & 15, reg >> 4);
read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
}
return tmp;
}
@ -2350,7 +2358,7 @@ static bool do_2scalar(DisasContext *s, arg_2scalar *a,
* perform an accumulation operation of that result into the
* destination.
*/
TCGv_i32 scalar;
TCGv_i32 scalar, tmp;
int pass;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@ -2377,17 +2385,20 @@ static bool do_2scalar(DisasContext *s, arg_2scalar *a,
}
scalar = neon_get_scalar(a->size, a->vm);
tmp = tcg_temp_new_i32();
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
TCGv_i32 tmp = neon_load_reg(a->vn, pass);
read_neon_element32(tmp, a->vn, pass, MO_32);
opfn(tmp, tmp, scalar);
if (accfn) {
TCGv_i32 rd = neon_load_reg(a->vd, pass);
TCGv_i32 rd = tcg_temp_new_i32();
read_neon_element32(rd, a->vd, pass, MO_32);
accfn(tmp, rd, tmp);
tcg_temp_free_i32(rd);
}
neon_store_reg(a->vd, pass, tmp);
write_neon_element32(tmp, a->vd, pass, MO_32);
}
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(scalar);
return true;
}
@ -2445,8 +2456,8 @@ static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
{
/* Two registers and a scalar, using gvec */
int vec_size = a->q ? 16 : 8;
int rd_ofs = neon_reg_offset(a->vd, 0);
int rn_ofs = neon_reg_offset(a->vn, 0);
int rd_ofs = neon_full_reg_offset(a->vd);
int rn_ofs = neon_full_reg_offset(a->vn);
int rm_ofs;
int idx;
TCGv_ptr fpstatus;
@ -2477,7 +2488,7 @@ static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
/* a->vm is M:Vm, which encodes both register and index */
idx = extract32(a->vm, a->size + 2, 2);
a->vm = extract32(a->vm, 0, a->size + 2);
rm_ofs = neon_reg_offset(a->vm, 0);
rm_ofs = neon_full_reg_offset(a->vm);
fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
@ -2542,7 +2553,7 @@ static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
* performs a kind of fused op-then-accumulate using a helper
* function that takes all of rd, rn and the scalar at once.
*/
TCGv_i32 scalar;
TCGv_i32 scalar, rn, rd;
int pass;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@ -2573,14 +2584,17 @@ static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
}
scalar = neon_get_scalar(a->size, a->vm);
rn = tcg_temp_new_i32();
rd = tcg_temp_new_i32();
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
TCGv_i32 rn = neon_load_reg(a->vn, pass);
TCGv_i32 rd = neon_load_reg(a->vd, pass);
read_neon_element32(rn, a->vn, pass, MO_32);
read_neon_element32(rd, a->vd, pass, MO_32);
opfn(rd, cpu_env, rn, scalar, rd);
tcg_temp_free_i32(rn);
neon_store_reg(a->vd, pass, rd);
write_neon_element32(rd, a->vd, pass, MO_32);
}
tcg_temp_free_i32(rn);
tcg_temp_free_i32(rd);
tcg_temp_free_i32(scalar);
return true;
@ -2647,12 +2661,12 @@ static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
scalar = neon_get_scalar(a->size, a->vm);
/* Load all inputs before writing any outputs, in case of overlap */
rn = neon_load_reg(a->vn, 0);
rn = tcg_temp_new_i32();
read_neon_element32(rn, a->vn, 0, MO_32);
rn0_64 = tcg_temp_new_i64();
opfn(rn0_64, rn, scalar);
tcg_temp_free_i32(rn);
rn = neon_load_reg(a->vn, 1);
read_neon_element32(rn, a->vn, 1, MO_32);
rn1_64 = tcg_temp_new_i64();
opfn(rn1_64, rn, scalar);
tcg_temp_free_i32(rn);
@ -2660,17 +2674,15 @@ static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
if (accfn) {
TCGv_i64 t64 = tcg_temp_new_i64();
neon_load_reg64(t64, a->vd);
accfn(t64, t64, rn0_64);
neon_store_reg64(t64, a->vd);
neon_load_reg64(t64, a->vd + 1);
accfn(t64, t64, rn1_64);
neon_store_reg64(t64, a->vd + 1);
read_neon_element64(t64, a->vd, 0, MO_64);
accfn(rn0_64, t64, rn0_64);
read_neon_element64(t64, a->vd, 1, MO_64);
accfn(rn1_64, t64, rn1_64);
tcg_temp_free_i64(t64);
} else {
neon_store_reg64(rn0_64, a->vd);
neon_store_reg64(rn1_64, a->vd + 1);
}
write_neon_element64(rn0_64, a->vd, 0, MO_64);
write_neon_element64(rn1_64, a->vd, 1, MO_64);
tcg_temp_free_i64(rn0_64);
tcg_temp_free_i64(rn1_64);
return true;
@ -2803,10 +2815,10 @@ static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
right = tcg_temp_new_i64();
dest = tcg_temp_new_i64();
neon_load_reg64(right, a->vn);
neon_load_reg64(left, a->vm);
read_neon_element64(right, a->vn, 0, MO_64);
read_neon_element64(left, a->vm, 0, MO_64);
tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
neon_store_reg64(dest, a->vd);
write_neon_element64(dest, a->vd, 0, MO_64);
tcg_temp_free_i64(left);
tcg_temp_free_i64(right);
@ -2822,21 +2834,21 @@ static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
destright = tcg_temp_new_i64();
if (a->imm < 8) {
neon_load_reg64(right, a->vn);
neon_load_reg64(middle, a->vn + 1);
read_neon_element64(right, a->vn, 0, MO_64);
read_neon_element64(middle, a->vn, 1, MO_64);
tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
neon_load_reg64(left, a->vm);
read_neon_element64(left, a->vm, 0, MO_64);
tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
} else {
neon_load_reg64(right, a->vn + 1);
neon_load_reg64(middle, a->vm);
read_neon_element64(right, a->vn, 1, MO_64);
read_neon_element64(middle, a->vm, 0, MO_64);
tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
neon_load_reg64(left, a->vm + 1);
read_neon_element64(left, a->vm, 1, MO_64);
tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
}
neon_store_reg64(destright, a->vd);
neon_store_reg64(destleft, a->vd + 1);
write_neon_element64(destright, a->vd, 0, MO_64);
write_neon_element64(destleft, a->vd, 1, MO_64);
tcg_temp_free_i64(destright);
tcg_temp_free_i64(destleft);
@ -2876,30 +2888,34 @@ static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
return false;
}
n <<= 3;
tmp = tcg_temp_new_i32();
if (a->op) {
tmp = neon_load_reg(a->vd, 0);
read_neon_element32(tmp, a->vd, 0, MO_32);
} else {
tmp = tcg_temp_new_i32();
tcg_gen_movi_i32(tmp, 0);
}
tmp2 = neon_load_reg(a->vm, 0);
tmp2 = tcg_temp_new_i32();
read_neon_element32(tmp2, a->vm, 0, MO_32);
ptr1 = vfp_reg_ptr(true, a->vn);
tmp4 = tcg_const_i32(n);
gen_helper_neon_tbl(tmp2, tmp2, tmp, ptr1, tmp4);
tcg_temp_free_i32(tmp);
if (a->op) {
tmp = neon_load_reg(a->vd, 1);
read_neon_element32(tmp, a->vd, 1, MO_32);
} else {
tmp = tcg_temp_new_i32();
tcg_gen_movi_i32(tmp, 0);
}
tmp3 = neon_load_reg(a->vm, 1);
tmp3 = tcg_temp_new_i32();
read_neon_element32(tmp3, a->vm, 1, MO_32);
gen_helper_neon_tbl(tmp3, tmp3, tmp, ptr1, tmp4);
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(tmp4);
tcg_temp_free_ptr(ptr1);
neon_store_reg(a->vd, 0, tmp2);
neon_store_reg(a->vd, 1, tmp3);
tcg_temp_free_i32(tmp);
write_neon_element32(tmp2, a->vd, 0, MO_32);
write_neon_element32(tmp3, a->vd, 1, MO_32);
tcg_temp_free_i32(tmp2);
tcg_temp_free_i32(tmp3);
return true;
}
@ -2923,7 +2939,7 @@ static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
return true;
}
tcg_gen_gvec_dup_mem(a->size, neon_reg_offset(a->vd, 0),
tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
neon_element_offset(a->vm, a->index, a->size),
a->q ? 16 : 8, a->q ? 16 : 8);
return true;
@ -2932,6 +2948,7 @@ static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
{
int pass, half;
TCGv_i32 tmp[2];
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -2955,11 +2972,12 @@ static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
return true;
}
for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
TCGv_i32 tmp[2];
tmp[0] = tcg_temp_new_i32();
tmp[1] = tcg_temp_new_i32();
for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
for (half = 0; half < 2; half++) {
tmp[half] = neon_load_reg(a->vm, pass * 2 + half);
read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
switch (a->size) {
case 0:
tcg_gen_bswap32_i32(tmp[half], tmp[half]);
@ -2973,9 +2991,12 @@ static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
g_assert_not_reached();
}
}
neon_store_reg(a->vd, pass * 2, tmp[1]);
neon_store_reg(a->vd, pass * 2 + 1, tmp[0]);
write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
}
tcg_temp_free_i32(tmp[0]);
tcg_temp_free_i32(tmp[1]);
return true;
}
@ -3020,23 +3041,25 @@ static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
rm0_64 = tcg_temp_new_i64();
rm1_64 = tcg_temp_new_i64();
rd_64 = tcg_temp_new_i64();
tmp = neon_load_reg(a->vm, pass * 2);
tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vm, pass * 2, MO_32);
widenfn(rm0_64, tmp);
tcg_temp_free_i32(tmp);
tmp = neon_load_reg(a->vm, pass * 2 + 1);
read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
widenfn(rm1_64, tmp);
tcg_temp_free_i32(tmp);
opfn(rd_64, rm0_64, rm1_64);
tcg_temp_free_i64(rm0_64);
tcg_temp_free_i64(rm1_64);
if (accfn) {
TCGv_i64 tmp64 = tcg_temp_new_i64();
neon_load_reg64(tmp64, a->vd + pass);
read_neon_element64(tmp64, a->vd, pass, MO_64);
accfn(rd_64, tmp64, rd_64);
tcg_temp_free_i64(tmp64);
}
neon_store_reg64(rd_64, a->vd + pass);
write_neon_element64(rd_64, a->vd, pass, MO_64);
tcg_temp_free_i64(rd_64);
}
return true;
@ -3234,12 +3257,14 @@ static bool do_vmovn(DisasContext *s, arg_2misc *a,
rd0 = tcg_temp_new_i32();
rd1 = tcg_temp_new_i32();
neon_load_reg64(rm, a->vm);
read_neon_element64(rm, a->vm, 0, MO_64);
narrowfn(rd0, cpu_env, rm);
neon_load_reg64(rm, a->vm + 1);
read_neon_element64(rm, a->vm, 1, MO_64);
narrowfn(rd1, cpu_env, rm);
neon_store_reg(a->vd, 0, rd0);
neon_store_reg(a->vd, 1, rd1);
write_neon_element32(rd0, a->vd, 0, MO_32);
write_neon_element32(rd1, a->vd, 1, MO_32);
tcg_temp_free_i32(rd0);
tcg_temp_free_i32(rd1);
tcg_temp_free_i64(rm);
return true;
}
@ -3296,16 +3321,18 @@ static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
}
rd = tcg_temp_new_i64();
rm0 = tcg_temp_new_i32();
rm1 = tcg_temp_new_i32();
rm0 = neon_load_reg(a->vm, 0);
rm1 = neon_load_reg(a->vm, 1);
read_neon_element32(rm0, a->vm, 0, MO_32);
read_neon_element32(rm1, a->vm, 1, MO_32);
widenfn(rd, rm0);
tcg_gen_shli_i64(rd, rd, 8 << a->size);
neon_store_reg64(rd, a->vd);
write_neon_element64(rd, a->vd, 0, MO_64);
widenfn(rd, rm1);
tcg_gen_shli_i64(rd, rd, 8 << a->size);
neon_store_reg64(rd, a->vd + 1);
write_neon_element64(rd, a->vd, 1, MO_64);
tcg_temp_free_i64(rd);
tcg_temp_free_i32(rm0);
@ -3339,21 +3366,25 @@ static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
fpst = fpstatus_ptr(FPST_STD);
ahp = get_ahp_flag();
tmp = neon_load_reg(a->vm, 0);
tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vm, 0, MO_32);
gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
tmp2 = neon_load_reg(a->vm, 1);
tmp2 = tcg_temp_new_i32();
read_neon_element32(tmp2, a->vm, 1, MO_32);
gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
tcg_gen_shli_i32(tmp2, tmp2, 16);
tcg_gen_or_i32(tmp2, tmp2, tmp);
tcg_temp_free_i32(tmp);
tmp = neon_load_reg(a->vm, 2);
read_neon_element32(tmp, a->vm, 2, MO_32);
gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
tmp3 = neon_load_reg(a->vm, 3);
neon_store_reg(a->vd, 0, tmp2);
tmp3 = tcg_temp_new_i32();
read_neon_element32(tmp3, a->vm, 3, MO_32);
write_neon_element32(tmp2, a->vd, 0, MO_32);
tcg_temp_free_i32(tmp2);
gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
tcg_gen_shli_i32(tmp3, tmp3, 16);
tcg_gen_or_i32(tmp3, tmp3, tmp);
neon_store_reg(a->vd, 1, tmp3);
write_neon_element32(tmp3, a->vd, 1, MO_32);
tcg_temp_free_i32(tmp3);
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(ahp);
tcg_temp_free_ptr(fpst);
@ -3388,21 +3419,25 @@ static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
fpst = fpstatus_ptr(FPST_STD);
ahp = get_ahp_flag();
tmp3 = tcg_temp_new_i32();
tmp = neon_load_reg(a->vm, 0);
tmp2 = neon_load_reg(a->vm, 1);
tmp2 = tcg_temp_new_i32();
tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vm, 0, MO_32);
read_neon_element32(tmp2, a->vm, 1, MO_32);
tcg_gen_ext16u_i32(tmp3, tmp);
gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
neon_store_reg(a->vd, 0, tmp3);
write_neon_element32(tmp3, a->vd, 0, MO_32);
tcg_gen_shri_i32(tmp, tmp, 16);
gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
neon_store_reg(a->vd, 1, tmp);
tmp3 = tcg_temp_new_i32();
write_neon_element32(tmp, a->vd, 1, MO_32);
tcg_temp_free_i32(tmp);
tcg_gen_ext16u_i32(tmp3, tmp2);
gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
neon_store_reg(a->vd, 2, tmp3);
write_neon_element32(tmp3, a->vd, 2, MO_32);
tcg_temp_free_i32(tmp3);
tcg_gen_shri_i32(tmp2, tmp2, 16);
gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
neon_store_reg(a->vd, 3, tmp2);
write_neon_element32(tmp2, a->vd, 3, MO_32);
tcg_temp_free_i32(tmp2);
tcg_temp_free_i32(ahp);
tcg_temp_free_ptr(fpst);
@ -3412,8 +3447,8 @@ static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
int vec_size = a->q ? 16 : 8;
int rd_ofs = neon_reg_offset(a->vd, 0);
int rm_ofs = neon_reg_offset(a->vm, 0);
int rd_ofs = neon_full_reg_offset(a->vd);
int rm_ofs = neon_full_reg_offset(a->vm);
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -3508,6 +3543,7 @@ DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
TCGv_i32 tmp;
int pass;
/* Handle a 2-reg-misc operation by iterating 32 bits at a time */
@ -3533,11 +3569,13 @@ static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
return true;
}
tmp = tcg_temp_new_i32();
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
TCGv_i32 tmp = neon_load_reg(a->vm, pass);
read_neon_element32(tmp, a->vm, pass, MO_32);
fn(tmp, tmp);
neon_store_reg(a->vd, pass, tmp);
write_neon_element32(tmp, a->vd, pass, MO_32);
}
tcg_temp_free_i32(tmp);
return true;
}
@ -3812,10 +3850,10 @@ static bool trans_VSWP(DisasContext *s, arg_2misc *a)
rm = tcg_temp_new_i64();
rd = tcg_temp_new_i64();
for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
neon_load_reg64(rm, a->vm + pass);
neon_load_reg64(rd, a->vd + pass);
neon_store_reg64(rm, a->vd + pass);
neon_store_reg64(rd, a->vm + pass);
read_neon_element64(rm, a->vm, pass, MO_64);
read_neon_element64(rd, a->vd, pass, MO_64);
write_neon_element64(rm, a->vd, pass, MO_64);
write_neon_element64(rd, a->vm, pass, MO_64);
}
tcg_temp_free_i64(rm);
tcg_temp_free_i64(rd);
@ -3890,25 +3928,29 @@ static bool trans_VTRN(DisasContext *s, arg_2misc *a)
return true;
}
if (a->size == 2) {
tmp = tcg_temp_new_i32();
tmp2 = tcg_temp_new_i32();
if (a->size == MO_32) {
for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
tmp = neon_load_reg(a->vm, pass);
tmp2 = neon_load_reg(a->vd, pass + 1);
neon_store_reg(a->vm, pass, tmp2);
neon_store_reg(a->vd, pass + 1, tmp);
read_neon_element32(tmp, a->vm, pass, MO_32);
read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
write_neon_element32(tmp2, a->vm, pass, MO_32);
write_neon_element32(tmp, a->vd, pass + 1, MO_32);
}
} else {
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
tmp = neon_load_reg(a->vm, pass);
tmp2 = neon_load_reg(a->vd, pass);
if (a->size == 0) {
read_neon_element32(tmp, a->vm, pass, MO_32);
read_neon_element32(tmp2, a->vd, pass, MO_32);
if (a->size == MO_8) {
gen_neon_trn_u8(tmp, tmp2);
} else {
gen_neon_trn_u16(tmp, tmp2);
}
neon_store_reg(a->vm, pass, tmp2);
neon_store_reg(a->vd, pass, tmp);
write_neon_element32(tmp2, a->vm, pass, MO_32);
write_neon_element32(tmp, a->vd, pass, MO_32);
}
}
tcg_temp_free_i32(tmp);
tcg_temp_free_i32(tmp2);
return true;
}


@ -236,8 +236,8 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
tcg_gen_ext_i32_i64(nf, cpu_NF);
tcg_gen_ext_i32_i64(vf, cpu_VF);
neon_load_reg64(frn, rn);
neon_load_reg64(frm, rm);
vfp_load_reg64(frn, rn);
vfp_load_reg64(frm, rm);
switch (a->cc) {
case 0: /* eq: Z */
tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero,
@ -264,7 +264,7 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
tcg_temp_free_i64(tmp);
break;
}
neon_store_reg64(dest, rd);
vfp_store_reg64(dest, rd);
tcg_temp_free_i64(frn);
tcg_temp_free_i64(frm);
tcg_temp_free_i64(dest);
@ -283,8 +283,8 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
frn = tcg_temp_new_i32();
frm = tcg_temp_new_i32();
dest = tcg_temp_new_i32();
neon_load_reg32(frn, rn);
neon_load_reg32(frm, rm);
vfp_load_reg32(frn, rn);
vfp_load_reg32(frm, rm);
switch (a->cc) {
case 0: /* eq: Z */
tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero,
@ -315,7 +315,7 @@ static bool trans_VSEL(DisasContext *s, arg_VSEL *a)
if (sz == 1) {
tcg_gen_andi_i32(dest, dest, 0xffff);
}
neon_store_reg32(dest, rd);
vfp_store_reg32(dest, rd);
tcg_temp_free_i32(frn);
tcg_temp_free_i32(frm);
tcg_temp_free_i32(dest);
@ -385,9 +385,9 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
TCGv_i64 tcg_res;
tcg_op = tcg_temp_new_i64();
tcg_res = tcg_temp_new_i64();
neon_load_reg64(tcg_op, rm);
vfp_load_reg64(tcg_op, rm);
gen_helper_rintd(tcg_res, tcg_op, fpst);
neon_store_reg64(tcg_res, rd);
vfp_store_reg64(tcg_res, rd);
tcg_temp_free_i64(tcg_op);
tcg_temp_free_i64(tcg_res);
} else {
@ -395,13 +395,13 @@ static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
TCGv_i32 tcg_res;
tcg_op = tcg_temp_new_i32();
tcg_res = tcg_temp_new_i32();
neon_load_reg32(tcg_op, rm);
vfp_load_reg32(tcg_op, rm);
if (sz == 1) {
gen_helper_rinth(tcg_res, tcg_op, fpst);
} else {
gen_helper_rints(tcg_res, tcg_op, fpst);
}
neon_store_reg32(tcg_res, rd);
vfp_store_reg32(tcg_res, rd);
tcg_temp_free_i32(tcg_op);
tcg_temp_free_i32(tcg_res);
}
@ -463,14 +463,14 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
tcg_double = tcg_temp_new_i64();
tcg_res = tcg_temp_new_i64();
tcg_tmp = tcg_temp_new_i32();
neon_load_reg64(tcg_double, rm);
vfp_load_reg64(tcg_double, rm);
if (is_signed) {
gen_helper_vfp_tosld(tcg_res, tcg_double, tcg_shift, fpst);
} else {
gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst);
}
tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res);
neon_store_reg32(tcg_tmp, rd);
vfp_store_reg32(tcg_tmp, rd);
tcg_temp_free_i32(tcg_tmp);
tcg_temp_free_i64(tcg_res);
tcg_temp_free_i64(tcg_double);
@ -478,7 +478,7 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
TCGv_i32 tcg_single, tcg_res;
tcg_single = tcg_temp_new_i32();
tcg_res = tcg_temp_new_i32();
neon_load_reg32(tcg_single, rm);
vfp_load_reg32(tcg_single, rm);
if (sz == 1) {
if (is_signed) {
gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst);
@ -492,7 +492,7 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
}
}
neon_store_reg32(tcg_res, rd);
vfp_store_reg32(tcg_res, rd);
tcg_temp_free_i32(tcg_res);
tcg_temp_free_i32(tcg_single);
}
@ -511,11 +511,9 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
{
/* VMOV scalar to general purpose register */
TCGv_i32 tmp;
int pass;
uint32_t offset;
/* SIZE == 2 is a VFP instruction; otherwise NEON. */
if (a->size == 2
/* SIZE == MO_32 is a VFP instruction; otherwise NEON. */
if (a->size == MO_32
? !dc_isar_feature(aa32_fpsp_v2, s)
: !arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -526,44 +524,12 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
return false;
}
offset = a->index << a->size;
pass = extract32(offset, 2, 1);
offset = extract32(offset, 0, 2) * 8;
if (!vfp_access_check(s)) {
return true;
}
tmp = neon_load_reg(a->vn, pass);
switch (a->size) {
case 0:
if (offset) {
tcg_gen_shri_i32(tmp, tmp, offset);
}
if (a->u) {
gen_uxtb(tmp);
} else {
gen_sxtb(tmp);
}
break;
case 1:
if (a->u) {
if (offset) {
tcg_gen_shri_i32(tmp, tmp, 16);
} else {
gen_uxth(tmp);
}
} else {
if (offset) {
tcg_gen_sari_i32(tmp, tmp, 16);
} else {
gen_sxth(tmp);
}
}
break;
case 2:
break;
}
tmp = tcg_temp_new_i32();
read_neon_element32(tmp, a->vn, a->index, a->size | (a->u ? 0 : MO_SIGN));
store_reg(s, a->rt, tmp);
return true;
@ -572,12 +538,10 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a)
{
/* VMOV general purpose register to scalar */
TCGv_i32 tmp, tmp2;
int pass;
uint32_t offset;
TCGv_i32 tmp;
/* SIZE == 2 is a VFP instruction; otherwise NEON. */
if (a->size == 2
/* SIZE == MO_32 is a VFP instruction; otherwise NEON. */
if (a->size == MO_32
? !dc_isar_feature(aa32_fpsp_v2, s)
: !arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@ -588,30 +552,13 @@ static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a)
return false;
}
offset = a->index << a->size;
pass = extract32(offset, 2, 1);
offset = extract32(offset, 0, 2) * 8;
if (!vfp_access_check(s)) {
return true;
}
tmp = load_reg(s, a->rt);
switch (a->size) {
case 0:
tmp2 = neon_load_reg(a->vn, pass);
tcg_gen_deposit_i32(tmp, tmp2, tmp, offset, 8);
tcg_temp_free_i32(tmp2);
break;
case 1:
tmp2 = neon_load_reg(a->vn, pass);
tcg_gen_deposit_i32(tmp, tmp2, tmp, offset, 16);
tcg_temp_free_i32(tmp2);
break;
case 2:
break;
}
neon_store_reg(a->vn, pass, tmp);
write_neon_element32(tmp, a->vn, a->index, a->size);
tcg_temp_free_i32(tmp);
return true;
}
@ -653,7 +600,7 @@ static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
}
tmp = load_reg(s, a->rt);
tcg_gen_gvec_dup_i32(size, neon_reg_offset(a->vn, 0),
tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(a->vn),
vec_size, vec_size, tmp);
tcg_temp_free_i32(tmp);
@ -829,14 +776,14 @@ static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a)
if (a->l) {
/* VFP to general purpose register */
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vn);
vfp_load_reg32(tmp, a->vn);
tcg_gen_andi_i32(tmp, tmp, 0xffff);
store_reg(s, a->rt, tmp);
} else {
/* general purpose register to VFP */
tmp = load_reg(s, a->rt);
tcg_gen_andi_i32(tmp, tmp, 0xffff);
neon_store_reg32(tmp, a->vn);
vfp_store_reg32(tmp, a->vn);
tcg_temp_free_i32(tmp);
}
@ -858,7 +805,7 @@ static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a)
if (a->l) {
/* VFP to general purpose register */
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vn);
vfp_load_reg32(tmp, a->vn);
if (a->rt == 15) {
/* Set the 4 flag bits in the CPSR. */
gen_set_nzcv(tmp);
@ -869,7 +816,7 @@ static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a)
} else {
/* general purpose register to VFP */
tmp = load_reg(s, a->rt);
neon_store_reg32(tmp, a->vn);
vfp_store_reg32(tmp, a->vn);
tcg_temp_free_i32(tmp);
}
@ -895,18 +842,18 @@ static bool trans_VMOV_64_sp(DisasContext *s, arg_VMOV_64_sp *a)
if (a->op) {
/* fpreg to gpreg */
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
store_reg(s, a->rt, tmp);
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm + 1);
vfp_load_reg32(tmp, a->vm + 1);
store_reg(s, a->rt2, tmp);
} else {
/* gpreg to fpreg */
tmp = load_reg(s, a->rt);
neon_store_reg32(tmp, a->vm);
vfp_store_reg32(tmp, a->vm);
tcg_temp_free_i32(tmp);
tmp = load_reg(s, a->rt2);
neon_store_reg32(tmp, a->vm + 1);
vfp_store_reg32(tmp, a->vm + 1);
tcg_temp_free_i32(tmp);
}
@ -938,18 +885,18 @@ static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a)
if (a->op) {
/* fpreg to gpreg */
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm * 2);
vfp_load_reg32(tmp, a->vm * 2);
store_reg(s, a->rt, tmp);
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm * 2 + 1);
vfp_load_reg32(tmp, a->vm * 2 + 1);
store_reg(s, a->rt2, tmp);
} else {
/* gpreg to fpreg */
tmp = load_reg(s, a->rt);
neon_store_reg32(tmp, a->vm * 2);
vfp_store_reg32(tmp, a->vm * 2);
tcg_temp_free_i32(tmp);
tmp = load_reg(s, a->rt2);
neon_store_reg32(tmp, a->vm * 2 + 1);
vfp_store_reg32(tmp, a->vm * 2 + 1);
tcg_temp_free_i32(tmp);
}
@ -980,9 +927,9 @@ static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a)
tmp = tcg_temp_new_i32();
if (a->l) {
gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
} else {
neon_load_reg32(tmp, a->vd);
vfp_load_reg32(tmp, a->vd);
gen_aa32_st16(s, tmp, addr, get_mem_index(s));
}
tcg_temp_free_i32(tmp);
@ -1014,9 +961,9 @@ static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a)
tmp = tcg_temp_new_i32();
if (a->l) {
gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
} else {
neon_load_reg32(tmp, a->vd);
vfp_load_reg32(tmp, a->vd);
gen_aa32_st32(s, tmp, addr, get_mem_index(s));
}
tcg_temp_free_i32(tmp);
@ -1055,9 +1002,9 @@ static bool trans_VLDR_VSTR_dp(DisasContext *s, arg_VLDR_VSTR_dp *a)
tmp = tcg_temp_new_i64();
if (a->l) {
gen_aa32_ld64(s, tmp, addr, get_mem_index(s));
neon_store_reg64(tmp, a->vd);
vfp_store_reg64(tmp, a->vd);
} else {
neon_load_reg64(tmp, a->vd);
vfp_load_reg64(tmp, a->vd);
gen_aa32_st64(s, tmp, addr, get_mem_index(s));
}
tcg_temp_free_i64(tmp);
@ -1119,10 +1066,10 @@ static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a)
if (a->l) {
/* load */
gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
neon_store_reg32(tmp, a->vd + i);
vfp_store_reg32(tmp, a->vd + i);
} else {
/* store */
neon_load_reg32(tmp, a->vd + i);
vfp_load_reg32(tmp, a->vd + i);
gen_aa32_st32(s, tmp, addr, get_mem_index(s));
}
tcg_gen_addi_i32(addr, addr, offset);
@ -1202,10 +1149,10 @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
if (a->l) {
/* load */
gen_aa32_ld64(s, tmp, addr, get_mem_index(s));
neon_store_reg64(tmp, a->vd + i);
vfp_store_reg64(tmp, a->vd + i);
} else {
/* store */
neon_load_reg64(tmp, a->vd + i);
vfp_load_reg64(tmp, a->vd + i);
gen_aa32_st64(s, tmp, addr, get_mem_index(s));
}
tcg_gen_addi_i32(addr, addr, offset);
@ -1338,15 +1285,15 @@ static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
fd = tcg_temp_new_i32();
fpst = fpstatus_ptr(FPST_FPCR);
neon_load_reg32(f0, vn);
neon_load_reg32(f1, vm);
vfp_load_reg32(f0, vn);
vfp_load_reg32(f1, vm);
for (;;) {
if (reads_vd) {
neon_load_reg32(fd, vd);
vfp_load_reg32(fd, vd);
}
fn(fd, f0, f1, fpst);
neon_store_reg32(fd, vd);
vfp_store_reg32(fd, vd);
if (veclen == 0) {
break;
@ -1356,10 +1303,10 @@ static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
veclen--;
vd = vfp_advance_sreg(vd, delta_d);
vn = vfp_advance_sreg(vn, delta_d);
neon_load_reg32(f0, vn);
vfp_load_reg32(f0, vn);
if (delta_m) {
vm = vfp_advance_sreg(vm, delta_m);
neon_load_reg32(f1, vm);
vfp_load_reg32(f1, vm);
}
}
@ -1402,14 +1349,14 @@ static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn,
fd = tcg_temp_new_i32();
fpst = fpstatus_ptr(FPST_FPCR_F16);
neon_load_reg32(f0, vn);
neon_load_reg32(f1, vm);
vfp_load_reg32(f0, vn);
vfp_load_reg32(f1, vm);
if (reads_vd) {
neon_load_reg32(fd, vd);
vfp_load_reg32(fd, vd);
}
fn(fd, f0, f1, fpst);
neon_store_reg32(fd, vd);
vfp_store_reg32(fd, vd);
tcg_temp_free_i32(f0);
tcg_temp_free_i32(f1);
@ -1469,15 +1416,15 @@ static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
fd = tcg_temp_new_i64();
fpst = fpstatus_ptr(FPST_FPCR);
neon_load_reg64(f0, vn);
neon_load_reg64(f1, vm);
vfp_load_reg64(f0, vn);
vfp_load_reg64(f1, vm);
for (;;) {
if (reads_vd) {
neon_load_reg64(fd, vd);
vfp_load_reg64(fd, vd);
}
fn(fd, f0, f1, fpst);
neon_store_reg64(fd, vd);
vfp_store_reg64(fd, vd);
if (veclen == 0) {
break;
@ -1486,10 +1433,10 @@ static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
veclen--;
vd = vfp_advance_dreg(vd, delta_d);
vn = vfp_advance_dreg(vn, delta_d);
neon_load_reg64(f0, vn);
vfp_load_reg64(f0, vn);
if (delta_m) {
vm = vfp_advance_dreg(vm, delta_m);
neon_load_reg64(f1, vm);
vfp_load_reg64(f1, vm);
}
}
@ -1542,11 +1489,11 @@ static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
f0 = tcg_temp_new_i32();
fd = tcg_temp_new_i32();
neon_load_reg32(f0, vm);
vfp_load_reg32(f0, vm);
for (;;) {
fn(fd, f0);
neon_store_reg32(fd, vd);
vfp_store_reg32(fd, vd);
if (veclen == 0) {
break;
@ -1556,7 +1503,7 @@ static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
/* single source one-many */
while (veclen--) {
vd = vfp_advance_sreg(vd, delta_d);
neon_store_reg32(fd, vd);
vfp_store_reg32(fd, vd);
}
break;
}
@ -1565,7 +1512,7 @@ static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
veclen--;
vd = vfp_advance_sreg(vd, delta_d);
vm = vfp_advance_sreg(vm, delta_m);
neon_load_reg32(f0, vm);
vfp_load_reg32(f0, vm);
}
tcg_temp_free_i32(f0);
@ -1598,9 +1545,9 @@ static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm)
}
f0 = tcg_temp_new_i32();
neon_load_reg32(f0, vm);
vfp_load_reg32(f0, vm);
fn(f0, f0);
neon_store_reg32(f0, vd);
vfp_store_reg32(f0, vd);
tcg_temp_free_i32(f0);
return true;
@ -1652,11 +1599,11 @@ static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
f0 = tcg_temp_new_i64();
fd = tcg_temp_new_i64();
neon_load_reg64(f0, vm);
vfp_load_reg64(f0, vm);
for (;;) {
fn(fd, f0);
neon_store_reg64(fd, vd);
vfp_store_reg64(fd, vd);
if (veclen == 0) {
break;
@ -1666,7 +1613,7 @@ static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
/* single source one-many */
while (veclen--) {
vd = vfp_advance_dreg(vd, delta_d);
neon_store_reg64(fd, vd);
vfp_store_reg64(fd, vd);
}
break;
}
@ -1675,7 +1622,7 @@ static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm)
veclen--;
vd = vfp_advance_dreg(vd, delta_d);
vd = vfp_advance_dreg(vm, delta_m);
neon_load_reg64(f0, vm);
vfp_load_reg64(f0, vm);
}
tcg_temp_free_i64(f0);
@ -2090,20 +2037,20 @@ static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
vm = tcg_temp_new_i32();
vd = tcg_temp_new_i32();
neon_load_reg32(vn, a->vn);
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vn, a->vn);
vfp_load_reg32(vm, a->vm);
if (neg_n) {
/* VFNMS, VFMS */
gen_helper_vfp_negh(vn, vn);
}
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
if (neg_d) {
/* VFNMA, VFNMS */
gen_helper_vfp_negh(vd, vd);
}
fpst = fpstatus_ptr(FPST_FPCR_F16);
gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst);
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(vn);
@ -2155,20 +2102,20 @@ static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
vm = tcg_temp_new_i32();
vd = tcg_temp_new_i32();
neon_load_reg32(vn, a->vn);
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vn, a->vn);
vfp_load_reg32(vm, a->vm);
if (neg_n) {
/* VFNMS, VFMS */
gen_helper_vfp_negs(vn, vn);
}
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
if (neg_d) {
/* VFNMA, VFNMS */
gen_helper_vfp_negs(vd, vd);
}
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_vfp_muladds(vd, vn, vm, vd, fpst);
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(vn);
@ -2226,20 +2173,20 @@ static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
vm = tcg_temp_new_i64();
vd = tcg_temp_new_i64();
neon_load_reg64(vn, a->vn);
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vn, a->vn);
vfp_load_reg64(vm, a->vm);
if (neg_n) {
/* VFNMS, VFMS */
gen_helper_vfp_negd(vn, vn);
}
neon_load_reg64(vd, a->vd);
vfp_load_reg64(vd, a->vd);
if (neg_d) {
/* VFNMA, VFNMS */
gen_helper_vfp_negd(vd, vd);
}
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_vfp_muladdd(vd, vn, vm, vd, fpst);
neon_store_reg64(vd, a->vd);
vfp_store_reg64(vd, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i64(vn);
@ -2283,7 +2230,7 @@ static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a)
}
fd = tcg_const_i32(vfp_expand_imm(MO_16, a->imm));
neon_store_reg32(fd, a->vd);
vfp_store_reg32(fd, a->vd);
tcg_temp_free_i32(fd);
return true;
}
@ -2323,7 +2270,7 @@ static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a)
fd = tcg_const_i32(vfp_expand_imm(MO_32, a->imm));
for (;;) {
neon_store_reg32(fd, vd);
vfp_store_reg32(fd, vd);
if (veclen == 0) {
break;
@ -2378,7 +2325,7 @@ static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a)
fd = tcg_const_i64(vfp_expand_imm(MO_64, a->imm));
for (;;) {
neon_store_reg64(fd, vd);
vfp_store_reg64(fd, vd);
if (veclen == 0) {
break;
@ -2450,11 +2397,11 @@ static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a)
vd = tcg_temp_new_i32();
vm = tcg_temp_new_i32();
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
if (a->z) {
tcg_gen_movi_i32(vm, 0);
} else {
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
}
if (a->e) {
@ -2489,11 +2436,11 @@ static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a)
vd = tcg_temp_new_i32();
vm = tcg_temp_new_i32();
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
if (a->z) {
tcg_gen_movi_i32(vm, 0);
} else {
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
}
if (a->e) {
@ -2533,11 +2480,11 @@ static bool trans_VCMP_dp(DisasContext *s, arg_VCMP_dp *a)
vd = tcg_temp_new_i64();
vm = tcg_temp_new_i64();
neon_load_reg64(vd, a->vd);
vfp_load_reg64(vd, a->vd);
if (a->z) {
tcg_gen_movi_i64(vm, 0);
} else {
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vm, a->vm);
}
if (a->e) {
@ -2572,7 +2519,7 @@ static bool trans_VCVT_f32_f16(DisasContext *s, arg_VCVT_f32_f16 *a)
/* The T bit tells us if we want the low or high 16 bits of Vm */
tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t));
gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp_mode);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_i32(ahp_mode);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
@ -2610,7 +2557,7 @@ static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a)
tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t));
vd = tcg_temp_new_i64();
gen_helper_vfp_fcvt_f16_to_f64(vd, tmp, fpst, ahp_mode);
neon_store_reg64(vd, a->vd);
vfp_store_reg64(vd, a->vd);
tcg_temp_free_i32(ahp_mode);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
@ -2636,7 +2583,7 @@ static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a)
ahp_mode = get_ahp_flag();
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp_mode);
tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
tcg_temp_free_i32(ahp_mode);
@ -2674,7 +2621,7 @@ static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a)
tmp = tcg_temp_new_i32();
vm = tcg_temp_new_i64();
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vm, a->vm);
gen_helper_vfp_fcvt_f64_to_f16(tmp, vm, fpst, ahp_mode);
tcg_temp_free_i64(vm);
tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
@ -2698,10 +2645,10 @@ static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR_F16);
gen_helper_rinth(tmp, tmp, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
return true;
@ -2721,10 +2668,10 @@ static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_rints(tmp, tmp, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
return true;
@ -2753,10 +2700,10 @@ static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a)
}
tmp = tcg_temp_new_i64();
neon_load_reg64(tmp, a->vm);
vfp_load_reg64(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_rintd(tmp, tmp, fpst);
neon_store_reg64(tmp, a->vd);
vfp_store_reg64(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i64(tmp);
return true;
@ -2777,13 +2724,13 @@ static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR_F16);
tcg_rmode = tcg_const_i32(float_round_to_zero);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
gen_helper_rinth(tmp, tmp, fpst);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tcg_rmode);
tcg_temp_free_i32(tmp);
@ -2805,13 +2752,13 @@ static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
tcg_rmode = tcg_const_i32(float_round_to_zero);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
gen_helper_rints(tmp, tmp, fpst);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tcg_rmode);
tcg_temp_free_i32(tmp);
@ -2842,13 +2789,13 @@ static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a)
}
tmp = tcg_temp_new_i64();
neon_load_reg64(tmp, a->vm);
vfp_load_reg64(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
tcg_rmode = tcg_const_i32(float_round_to_zero);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
gen_helper_rintd(tmp, tmp, fpst);
gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
neon_store_reg64(tmp, a->vd);
vfp_store_reg64(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i64(tmp);
tcg_temp_free_i32(tcg_rmode);
@ -2869,10 +2816,10 @@ static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR_F16);
gen_helper_rinth_exact(tmp, tmp, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
return true;
@ -2892,10 +2839,10 @@ static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a)
}
tmp = tcg_temp_new_i32();
neon_load_reg32(tmp, a->vm);
vfp_load_reg32(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_rints_exact(tmp, tmp, fpst);
neon_store_reg32(tmp, a->vd);
vfp_store_reg32(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i32(tmp);
return true;
@ -2924,10 +2871,10 @@ static bool trans_VRINTX_dp(DisasContext *s, arg_VRINTX_dp *a)
}
tmp = tcg_temp_new_i64();
neon_load_reg64(tmp, a->vm);
vfp_load_reg64(tmp, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
gen_helper_rintd_exact(tmp, tmp, fpst);
neon_store_reg64(tmp, a->vd);
vfp_store_reg64(tmp, a->vd);
tcg_temp_free_ptr(fpst);
tcg_temp_free_i64(tmp);
return true;
@ -2953,9 +2900,9 @@ static bool trans_VCVT_sp(DisasContext *s, arg_VCVT_sp *a)
vm = tcg_temp_new_i32();
vd = tcg_temp_new_i64();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
gen_helper_vfp_fcvtds(vd, vm, cpu_env);
neon_store_reg64(vd, a->vd);
vfp_store_reg64(vd, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_i64(vd);
return true;
@ -2981,9 +2928,9 @@ static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a)
vd = tcg_temp_new_i32();
vm = tcg_temp_new_i64();
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vm, a->vm);
gen_helper_vfp_fcvtsd(vd, vm, cpu_env);
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_i32(vd);
tcg_temp_free_i64(vm);
return true;
@ -3003,7 +2950,7 @@ static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a)
}
vm = tcg_temp_new_i32();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
fpst = fpstatus_ptr(FPST_FPCR_F16);
if (a->s) {
/* i32 -> f16 */
@ -3012,7 +2959,7 @@ static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a)
/* u32 -> f16 */
gen_helper_vfp_uitoh(vm, vm, fpst);
}
neon_store_reg32(vm, a->vd);
vfp_store_reg32(vm, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_ptr(fpst);
return true;
@ -3032,7 +2979,7 @@ static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a)
}
vm = tcg_temp_new_i32();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
if (a->s) {
/* i32 -> f32 */
@ -3041,7 +2988,7 @@ static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a)
/* u32 -> f32 */
gen_helper_vfp_uitos(vm, vm, fpst);
}
neon_store_reg32(vm, a->vd);
vfp_store_reg32(vm, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_ptr(fpst);
return true;
@ -3068,7 +3015,7 @@ static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a)
vm = tcg_temp_new_i32();
vd = tcg_temp_new_i64();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
fpst = fpstatus_ptr(FPST_FPCR);
if (a->s) {
/* i32 -> f64 */
@ -3077,7 +3024,7 @@ static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a)
/* u32 -> f64 */
gen_helper_vfp_uitod(vd, vm, fpst);
}
neon_store_reg64(vd, a->vd);
vfp_store_reg64(vd, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_i64(vd);
tcg_temp_free_ptr(fpst);
@ -3108,9 +3055,9 @@ static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a)
vm = tcg_temp_new_i64();
vd = tcg_temp_new_i32();
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vm, a->vm);
gen_helper_vjcvt(vd, vm, cpu_env);
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_i64(vm);
tcg_temp_free_i32(vd);
return true;
@ -3133,7 +3080,7 @@ static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a)
frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
vd = tcg_temp_new_i32();
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
fpst = fpstatus_ptr(FPST_FPCR_F16);
shift = tcg_const_i32(frac_bits);
@ -3168,7 +3115,7 @@ static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a)
g_assert_not_reached();
}
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_i32(vd);
tcg_temp_free_i32(shift);
tcg_temp_free_ptr(fpst);
@ -3192,7 +3139,7 @@ static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a)
frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
vd = tcg_temp_new_i32();
neon_load_reg32(vd, a->vd);
vfp_load_reg32(vd, a->vd);
fpst = fpstatus_ptr(FPST_FPCR);
shift = tcg_const_i32(frac_bits);
@ -3227,7 +3174,7 @@ static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a)
g_assert_not_reached();
}
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_i32(vd);
tcg_temp_free_i32(shift);
tcg_temp_free_ptr(fpst);
@ -3257,7 +3204,7 @@ static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a)
frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm);
vd = tcg_temp_new_i64();
neon_load_reg64(vd, a->vd);
vfp_load_reg64(vd, a->vd);
fpst = fpstatus_ptr(FPST_FPCR);
shift = tcg_const_i32(frac_bits);
@ -3292,7 +3239,7 @@ static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a)
g_assert_not_reached();
}
neon_store_reg64(vd, a->vd);
vfp_store_reg64(vd, a->vd);
tcg_temp_free_i64(vd);
tcg_temp_free_i32(shift);
tcg_temp_free_ptr(fpst);
@ -3314,7 +3261,7 @@ static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a)
fpst = fpstatus_ptr(FPST_FPCR_F16);
vm = tcg_temp_new_i32();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
if (a->s) {
if (a->rz) {
@ -3329,7 +3276,7 @@ static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a)
gen_helper_vfp_touih(vm, vm, fpst);
}
}
neon_store_reg32(vm, a->vd);
vfp_store_reg32(vm, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_ptr(fpst);
return true;
@ -3350,7 +3297,7 @@ static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a)
fpst = fpstatus_ptr(FPST_FPCR);
vm = tcg_temp_new_i32();
neon_load_reg32(vm, a->vm);
vfp_load_reg32(vm, a->vm);
if (a->s) {
if (a->rz) {
@ -3365,7 +3312,7 @@ static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a)
gen_helper_vfp_touis(vm, vm, fpst);
}
}
neon_store_reg32(vm, a->vd);
vfp_store_reg32(vm, a->vd);
tcg_temp_free_i32(vm);
tcg_temp_free_ptr(fpst);
return true;
@ -3393,7 +3340,7 @@ static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a)
fpst = fpstatus_ptr(FPST_FPCR);
vm = tcg_temp_new_i64();
vd = tcg_temp_new_i32();
neon_load_reg64(vm, a->vm);
vfp_load_reg64(vm, a->vm);
if (a->s) {
if (a->rz) {
@ -3408,7 +3355,7 @@ static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a)
gen_helper_vfp_touid(vd, vm, fpst);
}
}
neon_store_reg32(vd, a->vd);
vfp_store_reg32(vd, a->vd);
tcg_temp_free_i32(vd);
tcg_temp_free_i64(vm);
tcg_temp_free_ptr(fpst);
@ -3521,10 +3468,10 @@ static bool trans_VINS(DisasContext *s, arg_VINS *a)
/* Insert low half of Vm into high half of Vd */
rm = tcg_temp_new_i32();
rd = tcg_temp_new_i32();
neon_load_reg32(rm, a->vm);
neon_load_reg32(rd, a->vd);
vfp_load_reg32(rm, a->vm);
vfp_load_reg32(rd, a->vd);
tcg_gen_deposit_i32(rd, rd, rm, 16, 16);
neon_store_reg32(rd, a->vd);
vfp_store_reg32(rd, a->vd);
tcg_temp_free_i32(rm);
tcg_temp_free_i32(rd);
return true;
@ -3548,9 +3495,9 @@ static bool trans_VMOVX(DisasContext *s, arg_VINS *a)
/* Set Vd to high half of Vm */
rm = tcg_temp_new_i32();
neon_load_reg32(rm, a->vm);
vfp_load_reg32(rm, a->vm);
tcg_gen_shri_i32(rm, rm, 16);
neon_store_reg32(rm, a->vd);
vfp_store_reg32(rm, a->vd);
tcg_temp_free_i32(rm);
return true;
}


@ -1094,64 +1094,141 @@ static inline void gen_hlt(DisasContext *s, int imm)
unallocated_encoding(s);
}
static inline long vfp_reg_offset(bool dp, unsigned reg)
/*
* Return the offset of a "full" NEON Dreg.
*/
static long neon_full_reg_offset(unsigned reg)
{
return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]);
}
/*
* Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
* where 0 is the least significant end of the register.
*/
static long neon_element_offset(int reg, int element, MemOp memop)
{
int element_size = 1 << (memop & MO_SIZE);
int ofs = element * element_size;
#ifdef HOST_WORDS_BIGENDIAN
/*
* Calculate the offset assuming fully little-endian,
* then XOR to account for the order of the 8-byte units.
*/
if (element_size < 8) {
ofs ^= 8 - element_size;
}
#endif
return neon_full_reg_offset(reg) + ofs;
}
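/*
 * Worked example, for illustration only (not part of the change):
 * assuming HOST_WORDS_BIGENDIAN is defined,
 *
 *     neon_element_offset(reg, 0, MO_16):  ofs = 0, 0 ^ (8 - 2) = 6
 *     neon_element_offset(reg, 1, MO_16):  ofs = 2, 2 ^ (8 - 2) = 4
 *     neon_element_offset(reg, 1, MO_32):  ofs = 4, 4 ^ (8 - 4) = 0
 *
 * i.e. the least significant 16-bit element of the little-endian guest
 * view lands in the last two bytes of the big-endian 8-byte unit, which
 * is where the host keeps the low bits of that unit.  On a
 * little-endian host the #ifdef branch is skipped and ofs is used as-is.
 */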
/* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */
static long vfp_reg_offset(bool dp, unsigned reg)
{
if (dp) {
return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]);
return neon_element_offset(reg, 0, MO_64);
} else {
long ofs = offsetof(CPUARMState, vfp.zregs[reg >> 2].d[(reg >> 1) & 1]);
if (reg & 1) {
ofs += offsetof(CPU_DoubleU, l.upper);
} else {
ofs += offsetof(CPU_DoubleU, l.lower);
}
return ofs;
return neon_element_offset(reg >> 1, reg & 1, MO_32);
}
}
/* Return the offset of a 32-bit piece of a NEON register.
zero is the least significant end of the register. */
static inline long
neon_reg_offset (int reg, int n)
static inline void vfp_load_reg64(TCGv_i64 var, int reg)
{
int sreg;
sreg = reg * 2 + n;
return vfp_reg_offset(0, sreg);
tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg));
}
static TCGv_i32 neon_load_reg(int reg, int pass)
static inline void vfp_store_reg64(TCGv_i64 var, int reg)
{
TCGv_i32 tmp = tcg_temp_new_i32();
tcg_gen_ld_i32(tmp, cpu_env, neon_reg_offset(reg, pass));
return tmp;
tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg));
}
static void neon_store_reg(int reg, int pass, TCGv_i32 var)
{
tcg_gen_st_i32(var, cpu_env, neon_reg_offset(reg, pass));
tcg_temp_free_i32(var);
}
static inline void neon_load_reg64(TCGv_i64 var, int reg)
{
tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(1, reg));
}
static inline void neon_store_reg64(TCGv_i64 var, int reg)
{
tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(1, reg));
}
static inline void neon_load_reg32(TCGv_i32 var, int reg)
static inline void vfp_load_reg32(TCGv_i32 var, int reg)
{
tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg));
}
static inline void neon_store_reg32(TCGv_i32 var, int reg)
static inline void vfp_store_reg32(TCGv_i32 var, int reg)
{
tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg));
}
static void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
{
long off = neon_element_offset(reg, ele, memop);
switch (memop) {
case MO_SB:
tcg_gen_ld8s_i32(dest, cpu_env, off);
break;
case MO_UB:
tcg_gen_ld8u_i32(dest, cpu_env, off);
break;
case MO_SW:
tcg_gen_ld16s_i32(dest, cpu_env, off);
break;
case MO_UW:
tcg_gen_ld16u_i32(dest, cpu_env, off);
break;
case MO_UL:
case MO_SL:
tcg_gen_ld_i32(dest, cpu_env, off);
break;
default:
g_assert_not_reached();
}
}
static void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop)
{
long off = neon_element_offset(reg, ele, memop);
switch (memop) {
case MO_SL:
tcg_gen_ld32s_i64(dest, cpu_env, off);
break;
case MO_UL:
tcg_gen_ld32u_i64(dest, cpu_env, off);
break;
case MO_Q:
tcg_gen_ld_i64(dest, cpu_env, off);
break;
default:
g_assert_not_reached();
}
}
static void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop)
{
long off = neon_element_offset(reg, ele, memop);
switch (memop) {
case MO_8:
tcg_gen_st8_i32(src, cpu_env, off);
break;
case MO_16:
tcg_gen_st16_i32(src, cpu_env, off);
break;
case MO_32:
tcg_gen_st_i32(src, cpu_env, off);
break;
default:
g_assert_not_reached();
}
}
static void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop)
{
long off = neon_element_offset(reg, ele, memop);
switch (memop) {
case MO_64:
tcg_gen_st_i64(src, cpu_env, off);
break;
default:
g_assert_not_reached();
}
}
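/*
 * Usage sketch (hypothetical caller, shown only to illustrate the new
 * helpers): where a translator previously pulled in a whole 32-bit pass
 * with neon_load_reg(reg, pass), it can now access a single element of
 * exactly the size and extension it wants, e.g.
 *
 *     TCGv_i32 tmp = tcg_temp_new_i32();
 *     read_neon_element32(tmp, a->vm, elt, MO_SW);   sign-extended 16-bit load
 *     ... operate on tmp ...
 *     write_neon_element32(tmp, a->vd, elt, MO_16);  16-bit store
 *     tcg_temp_free_i32(tmp);
 *
 * The MemOp argument carries both the element size and (for loads) the
 * extension, and unexpected sizes fail hard in g_assert_not_reached().
 */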
static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
TCGv_ptr ret = tcg_temp_new_ptr();


@ -293,7 +293,7 @@ void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
intptr_t index = simd_data(desc);
uint32_t *d = vd;
int8_t *n = vn;
int8_t *m_indexed = (int8_t *)vm + index * 4;
int8_t *m_indexed = (int8_t *)vm + H4(index) * 4;
/* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
* Otherwise opr_sz is a multiple of 16.
@ -324,7 +324,7 @@ void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
intptr_t index = simd_data(desc);
uint32_t *d = vd;
uint8_t *n = vn;
uint8_t *m_indexed = (uint8_t *)vm + index * 4;
uint8_t *m_indexed = (uint8_t *)vm + H4(index) * 4;
/* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
* Otherwise opr_sz is a multiple of 16.
@ -1858,10 +1858,10 @@ DO_ABA(gvec_uaba_d, uint64_t)
r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
\
d[H4(0)] = r0; \
d[H4(1)] = r1; \
d[H4(2)] = r2; \
d[H4(3)] = r3; \
d[H2(0)] = r0; \
d[H2(1)] = r1; \
d[H2(2)] = r2; \
d[H2(3)] = r3; \
}
DO_NEON_PAIRWISE(neon_padd, add)
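/*
 * Background sketch, for illustration (these macros are not touched by
 * the patch): H2() and H4() re-index elements so that vectors stored as
 * host-order 64-bit chunks still read as little-endian.  Roughly:
 *
 *     big-endian host:     H2(x) = (x) ^ 3      H4(x) = (x) ^ 1
 *     little-endian host:  H2(x) = (x)          H4(x) = (x)
 *
 * The dot-product index above selects a 32-bit group, so it must be
 * swizzled with H4(); the pairwise results are 16-bit float16 values,
 * so they must be stored through H2(), not H4().
 */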


@ -265,10 +265,16 @@ int main(int argc, char **argv)
qtest_add_func("npcm7xx_rng/enable_disable", test_enable_disable);
qtest_add_func("npcm7xx_rng/rosel", test_rosel);
qtest_add_func("npcm7xx_rng/continuous/monobit", test_continuous_monobit);
qtest_add_func("npcm7xx_rng/continuous/runs", test_continuous_runs);
qtest_add_func("npcm7xx_rng/first_byte/monobit", test_first_byte_monobit);
qtest_add_func("npcm7xx_rng/first_byte/runs", test_first_byte_runs);
/*
* These tests fail intermittently; only run them on explicit
* request until we figure out why.
*/
if (getenv("QEMU_TEST_FLAKY_RNG_TESTS")) {
qtest_add_func("npcm7xx_rng/continuous/monobit", test_continuous_monobit);
qtest_add_func("npcm7xx_rng/continuous/runs", test_continuous_runs);
qtest_add_func("npcm7xx_rng/first_byte/monobit", test_first_byte_monobit);
qtest_add_func("npcm7xx_rng/first_byte/runs", test_first_byte_runs);
}
qtest_start("-machine npcm750-evb");
ret = g_test_run();