diff --git a/docs/system/gdb.rst b/docs/system/gdb.rst
index 144d083df3..bdb42dae2f 100644
--- a/docs/system/gdb.rst
+++ b/docs/system/gdb.rst
@@ -15,7 +15,8 @@ The ``-s`` option will make QEMU listen for an incoming connection
 from gdb on TCP port 1234, and ``-S`` will make QEMU not start the
 guest until you tell it to from gdb. (If you want to specify which
 TCP port to use or to use something other than TCP for the gdbstub
-connection, use the ``-gdb dev`` option instead of ``-s``.)
+connection, use the ``-gdb dev`` option instead of ``-s``. See
+`Using unix sockets`_ for an example.)
 
 .. parsed-literal::
 
@@ -100,6 +101,29 @@ not just those in the cluster you are currently working on::
 
   (gdb) set schedule-multiple on
 
+Using unix sockets
+==================
+
+An alternate method for connecting gdb to the QEMU gdbstub is to use
+a unix socket (if supported by your operating system). This is useful when
+running several tests in parallel, or if you do not have a known free TCP
+port (e.g. when running automated tests).
+
+First create a chardev with the appropriate options, then
+instruct the gdbserver to use that device:
+
+.. parsed-literal::
+
+   |qemu_system| -chardev socket,path=/tmp/gdb-socket,server=on,wait=off,id=gdb0 -gdb chardev:gdb0 -S ...
+
+Start gdb as before, but this time connect using the path to
+the socket::
+
+   (gdb) target remote /tmp/gdb-socket
+
+Note that to use a unix socket for the connection you will need
+gdb version 9.0 or newer.
+
 Advanced debugging options
 ==========================
diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c
index 5c7a51bbad..0299e81f85 100644
--- a/hw/arm/exynos4210.c
+++ b/hw/arm/exynos4210.c
@@ -173,6 +173,9 @@ static DeviceState *pl330_create(uint32_t base, qemu_or_irq *orgate,
     int i;
 
     dev = qdev_new("pl330");
+    object_property_set_link(OBJECT(dev), "memory",
+                             OBJECT(get_system_memory()),
+                             &error_fatal);
     qdev_prop_set_uint8(dev, "num_events", nevents);
     qdev_prop_set_uint8(dev, "num_chnls", 8);
     qdev_prop_set_uint8(dev, "num_periph_req", nreq);
diff --git a/hw/arm/fsl-imx6ul.c b/hw/arm/fsl-imx6ul.c
index e0128d7316..1d1a708dd9 100644
--- a/hw/arm/fsl-imx6ul.c
+++ b/hw/arm/fsl-imx6ul.c
@@ -534,6 +534,13 @@ static void fsl_imx6ul_realize(DeviceState *dev, Error **errp)
      */
     create_unimplemented_device("sdma", FSL_IMX6UL_SDMA_ADDR, 0x4000);
 
+    /*
+     * SAI (Audio SSI (Synchronous Serial Interface))
+     */
+    create_unimplemented_device("sai1", FSL_IMX6UL_SAI1_ADDR, 0x4000);
+    create_unimplemented_device("sai2", FSL_IMX6UL_SAI2_ADDR, 0x4000);
+    create_unimplemented_device("sai3", FSL_IMX6UL_SAI3_ADDR, 0x4000);
+
     /*
      * PWM
      */
@@ -542,6 +549,11 @@ static void fsl_imx6ul_realize(DeviceState *dev, Error **errp)
     create_unimplemented_device("pwm3", FSL_IMX6UL_PWM3_ADDR, 0x4000);
     create_unimplemented_device("pwm4", FSL_IMX6UL_PWM4_ADDR, 0x4000);
 
+    /*
+     * Audio ASRC (asynchronous sample rate converter)
+     */
+    create_unimplemented_device("asrc", FSL_IMX6UL_ASRC_ADDR, 0x4000);
+
     /*
      * CAN
      */
diff --git a/hw/arm/fsl-imx7.c b/hw/arm/fsl-imx7.c
index 2ff2cab924..149885f2b8 100644
--- a/hw/arm/fsl-imx7.c
+++ b/hw/arm/fsl-imx7.c
@@ -467,6 +467,13 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp)
     create_unimplemented_device("can1", FSL_IMX7_CAN1_ADDR, FSL_IMX7_CANn_SIZE);
     create_unimplemented_device("can2", FSL_IMX7_CAN2_ADDR, FSL_IMX7_CANn_SIZE);
 
+    /*
+     * SAI (Audio SSI (Synchronous Serial Interface))
+     */
+    create_unimplemented_device("sai1", FSL_IMX7_SAI1_ADDR, FSL_IMX7_SAIn_SIZE);
+    create_unimplemented_device("sai2", FSL_IMX7_SAI2_ADDR, FSL_IMX7_SAIn_SIZE);
+    create_unimplemented_device("sai3", FSL_IMX7_SAI3_ADDR, FSL_IMX7_SAIn_SIZE);
+
     /*
      * OCOTP
      */
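For reference, create_unimplemented_device() is QEMU's stock helper (from
include/hw/misc/unimp.h) for backing a register window with a stub that logs
guest accesses instead of faulting. A minimal sketch of how a board or SoC
model calls it; the device name and addresses below are illustrative, not
taken from a real datasheet:

    #include "qemu/osdep.h"
    #include "hw/misc/unimp.h"

    /* Illustrative values only; a real SoC takes these from its memory map. */
    #define MY_SOC_SAI1_ADDR 0x308A0000
    #define MY_SOC_SAIn_SIZE 0x10000

    static void my_soc_map_stubs(void)
    {
        /*
         * Maps a low-priority region at the given base: guest reads
         * return 0, writes are ignored, and each access is logged
         * when QEMU runs with "-d unimp".
         */
        create_unimplemented_device("sai1", MY_SOC_SAI1_ADDR, MY_SOC_SAIn_SIZE);
    }

(Note the third fsl-imx7.c call above originally reused the name "sai2" for
the SAI3 region; it is corrected to "sai3".)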
diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
index c1629df603..509c5f09b4 100644
--- a/hw/arm/sbsa-ref.c
+++ b/hw/arm/sbsa-ref.c
@@ -65,7 +65,7 @@ enum {
     SBSA_GIC_DIST,
     SBSA_GIC_REDIST,
     SBSA_SECURE_EC,
-    SBSA_GWDT,
+    SBSA_GWDT_WS0,
     SBSA_GWDT_REFRESH,
     SBSA_GWDT_CONTROL,
     SBSA_SMMU,
@@ -140,7 +140,7 @@ static const int sbsa_ref_irqmap[] = {
     [SBSA_AHCI] = 10,
     [SBSA_EHCI] = 11,
     [SBSA_SMMU] = 12, /* ... to 15 */
-    [SBSA_GWDT] = 16,
+    [SBSA_GWDT_WS0] = 16,
 };
 
 static const char * const valid_cpus[] = {
@@ -481,7 +481,7 @@ static void create_wdt(const SBSAMachineState *sms)
     hwaddr cbase = sbsa_ref_memmap[SBSA_GWDT_CONTROL].base;
     DeviceState *dev = qdev_new(TYPE_WDT_SBSA);
     SysBusDevice *s = SYS_BUS_DEVICE(dev);
-    int irq = sbsa_ref_irqmap[SBSA_GWDT];
+    int irq = sbsa_ref_irqmap[SBSA_GWDT_WS0];
 
     sysbus_realize_and_unref(s, &error_fatal);
     sysbus_mmio_map(s, 0, rbase);
diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c
index 245af81bbb..69c333e91b 100644
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -312,6 +312,9 @@ static void zynq_init(MachineState *machine)
     sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, pic[39-IRQ_OFFSET]);
 
     dev = qdev_new("pl330");
+    object_property_set_link(OBJECT(dev), "memory",
+                             OBJECT(address_space_mem),
+                             &error_fatal);
     qdev_prop_set_uint8(dev, "num_chnls", 8);
     qdev_prop_set_uint8(dev, "num_periph_req", 4);
     qdev_prop_set_uint8(dev, "num_events", 16);
diff --git a/hw/char/pl011.c b/hw/char/pl011.c
index dc85527a5f..6e2d7f7509 100644
--- a/hw/char/pl011.c
+++ b/hw/char/pl011.c
@@ -26,6 +26,7 @@
 #include "hw/qdev-properties-system.h"
 #include "migration/vmstate.h"
 #include "chardev/char-fe.h"
+#include "chardev/char-serial.h"
 #include "qemu/log.h"
 #include "qemu/module.h"
 #include "trace.h"
@@ -231,6 +232,11 @@ static void pl011_write(void *opaque, hwaddr offset,
             s->read_count = 0;
             s->read_pos = 0;
         }
+        if ((s->lcr ^ value) & 0x1) {
+            int break_enable = value & 0x1;
+            qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK,
+                              &break_enable);
+        }
         s->lcr = value;
         pl011_set_read_trigger(s);
        break;
diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c
index 944ba296b0..0cb46191c1 100644
--- a/hw/dma/pl330.c
+++ b/hw/dma/pl330.c
@@ -269,6 +269,9 @@ struct PL330State {
     uint8_t num_faulting;
     uint8_t periph_busy[PL330_PERIPH_NUM];
 
+    /* Memory region that DMA operations access */
+    MemoryRegion *mem_mr;
+    AddressSpace *mem_as;
 };
 
 #define TYPE_PL330 "pl330"
@@ -1108,7 +1111,7 @@ static inline const PL330InsnDesc *pl330_fetch_insn(PL330Chan *ch)
     uint8_t opcode;
     int i;
 
-    dma_memory_read(&address_space_memory, ch->pc, &opcode, 1);
+    dma_memory_read(ch->parent->mem_as, ch->pc, &opcode, 1);
     for (i = 0; insn_desc[i].size; i++) {
         if ((opcode & insn_desc[i].opmask) == insn_desc[i].opcode) {
             return &insn_desc[i];
@@ -1122,7 +1125,7 @@ static inline void pl330_exec_insn(PL330Chan *ch, const PL330InsnDesc *insn)
     uint8_t buf[PL330_INSN_MAXSIZE];
 
     assert(insn->size <= PL330_INSN_MAXSIZE);
-    dma_memory_read(&address_space_memory, ch->pc, buf, insn->size);
+    dma_memory_read(ch->parent->mem_as, ch->pc, buf, insn->size);
     insn->exec(ch, buf[0], &buf[1], insn->size - 1);
 }
 
@@ -1186,7 +1189,7 @@ static int pl330_exec_cycle(PL330Chan *channel)
     if (q != NULL && q->len <= pl330_fifo_num_free(&s->fifo)) {
         int len = q->len - (q->addr & (q->len - 1));
 
-        dma_memory_read(&address_space_memory, q->addr, buf, len);
+        dma_memory_read(s->mem_as, q->addr, buf, len);
         trace_pl330_exec_cycle(q->addr, len);
         if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) {
             pl330_hexdump(buf, len);
@@ -1217,7 +1220,7 @@ static int pl330_exec_cycle(PL330Chan *channel)
             fifo_res = pl330_fifo_get(&s->fifo, buf, len, q->tag);
         }
         if (fifo_res == PL330_FIFO_OK || q->z) {
-            dma_memory_write(&address_space_memory, q->addr, buf, len);
+            dma_memory_write(s->mem_as, q->addr, buf, len);
             trace_pl330_exec_cycle(q->addr, len);
             if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) {
                 pl330_hexdump(buf, len);
@@ -1562,6 +1565,18 @@ static void pl330_realize(DeviceState *dev, Error **errp)
                           "dma", PL330_IOMEM_SIZE);
     sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->iomem);
 
+    if (!s->mem_mr) {
+        error_setg(errp, "'memory' link is not set");
+        return;
+    } else if (s->mem_mr == get_system_memory()) {
+        /* Avoid creating new AS for system memory. */
+        s->mem_as = &address_space_memory;
+    } else {
+        s->mem_as = g_new0(AddressSpace, 1);
+        address_space_init(s->mem_as, s->mem_mr,
+                           memory_region_name(s->mem_mr));
+    }
+
     s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, pl330_exec_cycle_timer, s);
 
     s->cfg[0] = (s->mgr_ns_at_rst ? 0x4 : 0) |
@@ -1656,6 +1671,9 @@ static Property pl330_properties[] = {
     DEFINE_PROP_UINT8("rd_q_dep", PL330State, rd_q_dep, 16),
     DEFINE_PROP_UINT16("data_buffer_dep", PL330State, data_buffer_dep, 256),
 
+    DEFINE_PROP_LINK("memory", PL330State, mem_mr,
+                     TYPE_MEMORY_REGION, MemoryRegion *),
+
     DEFINE_PROP_END_OF_LIST(),
 };
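The new "memory" link makes the pl330's DMA address space explicit:
pl330_realize() now fails if the link is unset, reuses the global
address_space_memory when the link points at the system memory region, and
otherwise builds a private AddressSpace for the linked region. Boards must
therefore wire the link before realizing the device, exactly as the
exynos4210 and xilinx_zynq hunks above do. A condensed sketch of that
creation sequence (error handling and IRQ wiring omitted; the function name
is illustrative):

    #include "qemu/osdep.h"
    #include "exec/address-spaces.h"
    #include "hw/sysbus.h"
    #include "hw/qdev-properties.h"
    #include "qapi/error.h"

    static DeviceState *my_board_create_pl330(void)
    {
        DeviceState *dev = qdev_new("pl330");

        /* The link must be set before realize, or realize now fails. */
        object_property_set_link(OBJECT(dev), "memory",
                                 OBJECT(get_system_memory()), &error_fatal);
        qdev_prop_set_uint8(dev, "num_chnls", 8);
        sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
        return dev;
    }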
diff --git a/include/hw/arm/fsl-imx7.h b/include/hw/arm/fsl-imx7.h
index f5d527a490..1c5fa6fd67 100644
--- a/include/hw/arm/fsl-imx7.h
+++ b/include/hw/arm/fsl-imx7.h
@@ -174,6 +174,11 @@ enum FslIMX7MemoryMap {
     FSL_IMX7_UART6_ADDR = 0x30A80000,
     FSL_IMX7_UART7_ADDR = 0x30A90000,
 
+    FSL_IMX7_SAI1_ADDR = 0x308A0000,
+    FSL_IMX7_SAI2_ADDR = 0x308B0000,
+    FSL_IMX7_SAI3_ADDR = 0x308C0000,
+    FSL_IMX7_SAIn_SIZE = 0x10000,
+
     FSL_IMX7_ENET1_ADDR = 0x30BE0000,
     FSL_IMX7_ENET2_ADDR = 0x30BF0000,
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 2866dd7658..a82e39dd97 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1017,6 +1017,9 @@ static void arm_cpu_dump_state(CPUState *cs, FILE *f, int flags)
                          i, v);
         }
         qemu_fprintf(f, "FPSCR: %08x\n", vfp_get_fpscr(env));
+        if (cpu_isar_feature(aa32_mve, cpu)) {
+            qemu_fprintf(f, "VPR: %08x\n", env->v7m.vpr);
+        }
     }
 }
 
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 9f0a5f84d5..5cf8996ae3 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -54,6 +54,7 @@
 #define EXCP_LAZYFP 20 /* v7M fault during lazy FP stacking */
 #define EXCP_LSERR 21 /* v8M LSERR SecureFault */
 #define EXCP_UNALIGNED 22 /* v7M UNALIGNED UsageFault */
+#define EXCP_DIVBYZERO 23 /* v7M DIVBYZERO UsageFault */
 /* NB: add new EXCP_ defines to the array in arm_log_exception() too */
 
 #define ARMV7M_EXCP_RESET 1
diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index 56e40844ad..3db9b15f12 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -33,8 +33,105 @@
 DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32)
 
+DEF_HELPER_FLAGS_4(mve_vldrb_sg_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrb_sg_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vldrb_sg_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrb_sg_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrb_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrw_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrd_sg_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vstrb_sg_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrb_sg_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrb_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrh_sg_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrh_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrw_sg_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrd_sg_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_os_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_os_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrh_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrw_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrd_sg_os_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vstrh_sg_os_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrh_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrw_sg_os_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrd_sg_os_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vldrw_sg_wb_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vldrd_sg_wb_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrw_sg_wb_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vstrd_sg_wb_ud, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld20b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld20h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld20w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld21b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld21h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld21w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld40b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld40h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld40w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld41b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld41h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld41w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld42b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld42h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld42w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vld43b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld43h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vld43w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst20b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst20h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst20w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst21b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst21h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst21w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst40b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst40h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst40w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst41b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst41h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst41w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst42b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst42h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst42w, TCG_CALL_NO_WG, void, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vst43b, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst43h, TCG_CALL_NO_WG, void, env, i32, i32)
+DEF_HELPER_FLAGS_3(mve_vst43w, TCG_CALL_NO_WG, void, env, i32, i32)
+
 DEF_HELPER_FLAGS_3(mve_vdup, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vidupb, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
+DEF_HELPER_FLAGS_4(mve_viduph, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
+DEF_HELPER_FLAGS_4(mve_vidupw, TCG_CALL_NO_WG, i32, env, ptr, i32, i32)
+
+DEF_HELPER_FLAGS_5(mve_viwdupb, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_5(mve_viwduph, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_5(mve_viwdupw, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+
+DEF_HELPER_FLAGS_5(mve_vdwdupb, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_5(mve_vdwduph, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+DEF_HELPER_FLAGS_5(mve_vdwdupw, TCG_CALL_NO_WG, i32, env, ptr, i32, i32, i32)
+
 DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
@@ -64,12 +161,53 @@ DEF_HELPER_FLAGS_3(mve_vnegw, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vfnegh, TCG_CALL_NO_WG, void, env, ptr, ptr)
 DEF_HELPER_FLAGS_3(mve_vfnegs, TCG_CALL_NO_WG, void, env, ptr, ptr)
 
+DEF_HELPER_FLAGS_3(mve_vqabsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqabsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqabsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vqnegb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqnegh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqnegw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vmaxab, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vmaxah, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vmaxaw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vminab, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vminah, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vminaw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vmovnbb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vmovnbh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vmovntb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vmovnth, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vqmovunbb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovunbh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovuntb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovunth, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vqmovnbsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovnbsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovntsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovntsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vqmovnbub, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovnbuh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovntub, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vqmovntuh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
 DEF_HELPER_FLAGS_4(mve_vand, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vbic, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vorr, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vorn, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_veor, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 
+DEF_HELPER_FLAGS_4(mve_vpsel, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_1(mve_vpnot, TCG_CALL_NO_WG, void, env)
+
+DEF_HELPER_FLAGS_2(mve_vctp, TCG_CALL_NO_WG, void, env, i32)
+
 DEF_HELPER_FLAGS_4(mve_vaddb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vaddh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vaddw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
@@ -145,6 +283,11 @@ DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 
+DEF_HELPER_FLAGS_4(mve_vmullpbh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullpth, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullpbw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vmullptw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
 DEF_HELPER_FLAGS_4(mve_vqdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vqdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vqdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
@@ -328,6 +471,30 @@ DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i3
 DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(mve_vmlab, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlah, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlaw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vmlasb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlash, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlasw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
@@ -349,6 +516,23 @@ DEF_HELPER_FLAGS_4(mve_vrmlaldavhuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vrmlsldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 DEF_HELPER_FLAGS_4(mve_vrmlsldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
 
+DEF_HELPER_FLAGS_4(mve_vmladavsb, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavsh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavsw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavub, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavuh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavuw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavb, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vmladavsxb, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavsxh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmladavsxw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavxb, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavxh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vmlsdavxw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_3(mve_vaddvsb, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvub, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvsh, TCG_CALL_NO_WG, i32, env, ptr, i32)
@@ -356,9 +540,36 @@ DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
 
+DEF_HELPER_FLAGS_3(mve_vmaxvsb, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxvsh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxvub, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxavb, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxavh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vmaxavw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vminvsb, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminvsh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminvsw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminvub, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminvuh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminvuw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminavb, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminavh, TCG_CALL_NO_WG, i32, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vminavw, TCG_CALL_NO_WG, i32, env, ptr, i32)
+
 DEF_HELPER_FLAGS_3(mve_vaddlv_s, TCG_CALL_NO_WG, i64, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vaddlv_u, TCG_CALL_NO_WG, i64, env, ptr, i64)
 
+DEF_HELPER_FLAGS_4(mve_vabavsb, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vabavsh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vabavsw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vabavub, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vabavuh, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vabavuw, TCG_CALL_NO_WG, i32, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_3(mve_vmovi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vandi, TCG_CALL_NO_WG, void, env, ptr, i64)
 DEF_HELPER_FLAGS_3(mve_vorri, TCG_CALL_NO_WG, void, env, ptr, i64)
@@ -391,6 +602,14 @@ DEF_HELPER_FLAGS_4(mve_vrshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vrshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(mve_vqrshli_sb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshli_sh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshli_sw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(mve_vqrshli_ub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshli_uh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(mve_vqrshli_uw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(mve_vshllbsb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshllbsh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vshllbub, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
@@ -463,3 +682,67 @@ DEF_HELPER_FLAGS_3(mve_uqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
 DEF_HELPER_FLAGS_3(mve_sqshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
 DEF_HELPER_FLAGS_3(mve_uqrshl, TCG_CALL_NO_RWG, i32, env, i32, i32)
 DEF_HELPER_FLAGS_3(mve_sqrshr, TCG_CALL_NO_RWG, i32, env, i32, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmpeqb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpeqh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpeqw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpneb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpneh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpnew, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpcsb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpcsh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpcsw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmphib, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmphih, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmphiw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpgeb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpgeh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpgew, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpltb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmplth, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpltw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpgtb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpgth, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpgtw, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpleb, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmpleh, TCG_CALL_NO_WG, void, env, ptr, ptr)
+DEF_HELPER_FLAGS_3(mve_vcmplew, TCG_CALL_NO_WG, void, env, ptr, ptr)
+
+DEF_HELPER_FLAGS_3(mve_vcmpeq_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpeq_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpeq_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmpne_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpne_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpne_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmpcs_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpcs_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpcs_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmphi_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmphi_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmphi_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmpge_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpge_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpge_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmplt_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmplt_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmplt_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmpgt_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpgt_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmpgt_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(mve_vcmple_scalarb, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmple_scalarh, TCG_CALL_NO_WG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(mve_vcmple_scalarw, TCG_CALL_NO_WG, void, env, ptr, i32)
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 155d8bf239..56c520cf8e 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -9345,6 +9345,18 @@ uint32_t HELPER(sxtb16)(uint32_t x)
     return res;
 }
 
+static void handle_possible_div0_trap(CPUARMState *env, uintptr_t ra)
+{
+    /*
+     * Take a division-by-zero exception if necessary; otherwise return
+     * to get the usual non-trapping division behaviour (result of 0)
+     */
+    if (arm_feature(env, ARM_FEATURE_M)
+        && (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_DIV_0_TRP_MASK)) {
+        raise_exception_ra(env, EXCP_DIVBYZERO, 0, 1, ra);
+    }
+}
+
 uint32_t HELPER(uxtb16)(uint32_t x)
 {
     uint32_t res;
@@ -9353,19 +9365,24 @@ uint32_t HELPER(uxtb16)(uint32_t x)
     return res;
 }
 
-int32_t HELPER(sdiv)(int32_t num, int32_t den)
+int32_t HELPER(sdiv)(CPUARMState *env, int32_t num, int32_t den)
 {
-    if (den == 0)
-        return 0;
-    if (num == INT_MIN && den == -1)
-        return INT_MIN;
+    if (den == 0) {
+        handle_possible_div0_trap(env, GETPC());
+        return 0;
+    }
+    if (num == INT_MIN && den == -1) {
+        return INT_MIN;
+    }
     return num / den;
 }
 
-uint32_t HELPER(udiv)(uint32_t num, uint32_t den)
+uint32_t HELPER(udiv)(CPUARMState *env, uint32_t num, uint32_t den)
 {
-    if (den == 0)
-        return 0;
+    if (den == 0) {
+        handle_possible_div0_trap(env, GETPC());
+        return 0;
+    }
     return num / den;
 }
 
@@ -9564,6 +9581,7 @@ void arm_log_exception(int idx)
             [EXCP_LAZYFP] = "v7M exception during lazy FP stacking",
             [EXCP_LSERR] = "v8M LSERR UsageFault",
             [EXCP_UNALIGNED] = "v7M UNALIGNED UsageFault",
+            [EXCP_DIVBYZERO] = "v7M DIVBYZERO UsageFault",
         };
 
         if (idx >= 0 && idx < ARRAY_SIZE(excnames)) {
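The two helpers above encode the Arm integer-division rules: UDIV/SDIV by
zero normally return 0 rather than trapping, M-profile cores can opt into a
DIVBYZERO UsageFault via CCR.DIV_0_TRP, and the INT_MIN / -1 overflow case
is defined to return INT_MIN. A host-side restatement of that contract,
purely for illustration (this is not QEMU code, and the trap is modelled
here as an abort):

    #include <limits.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    static int32_t model_sdiv(int32_t num, int32_t den, bool div_0_trp)
    {
        if (den == 0) {
            if (div_0_trp) {
                abort();          /* QEMU instead raises EXCP_DIVBYZERO */
            }
            return 0;             /* architected non-trapping result */
        }
        if (num == INT32_MIN && den == -1) {
            return INT32_MIN;     /* overflow case: wraps, never traps */
        }
        return num / den;
    }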
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 248569b0cd..aee8f0019b 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -6,8 +6,8 @@ DEF_HELPER_3(add_saturate, i32, env, i32, i32)
 DEF_HELPER_3(sub_saturate, i32, env, i32, i32)
 DEF_HELPER_3(add_usaturate, i32, env, i32, i32)
 DEF_HELPER_3(sub_usaturate, i32, env, i32, i32)
-DEF_HELPER_FLAGS_2(sdiv, TCG_CALL_NO_RWG_SE, s32, s32, s32)
-DEF_HELPER_FLAGS_2(udiv, TCG_CALL_NO_RWG_SE, i32, i32, i32)
+DEF_HELPER_FLAGS_3(sdiv, TCG_CALL_NO_RWG, s32, env, s32, s32)
+DEF_HELPER_FLAGS_3(udiv, TCG_CALL_NO_RWG, i32, env, i32, i32)
 DEF_HELPER_FLAGS_1(rbit, TCG_CALL_NO_RWG_SE, i32, i32)
 
 #define PAS_OP(pfx) \
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index d8381ba224..5d55de1a49 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -998,7 +998,6 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
     hwaddr xlat, len, doorbell_gpa;
     MemoryRegionSection mrs;
     MemoryRegion *mr;
-    int ret = 1;
 
     if (as == &address_space_memory) {
         return 0;
@@ -1006,15 +1005,19 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
 
     /* MSI doorbell address is translated by an IOMMU */
 
-    rcu_read_lock();
+    RCU_READ_LOCK_GUARD();
+
     mr = address_space_translate(as, address, &xlat, &len, true,
                                  MEMTXATTRS_UNSPECIFIED);
+
     if (!mr) {
-        goto unlock;
+        return 1;
     }
+
     mrs = memory_region_find(mr, xlat, 1);
+
     if (!mrs.mr) {
-        goto unlock;
+        return 1;
     }
 
     doorbell_gpa = mrs.offset_within_address_space;
@@ -1025,11 +1028,7 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
 
     trace_kvm_arm_fixup_msi_route(address, doorbell_gpa);
 
-    ret = 0;
-
-unlock:
-    rcu_read_unlock();
-    return ret;
+    return 0;
 }
 
 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
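The kvm.c change above is a pure cleanup: RCU_READ_LOCK_GUARD() (from
include/qemu/rcu.h) drops the RCU read lock automatically when the
enclosing scope exits, so the ret variable and the unlock: label become
unnecessary and every failure check can return directly. The shape of the
idiom, reduced to its essentials (function and parameter names are
illustrative):

    #include "qemu/osdep.h"
    #include "qemu/rcu.h"

    static int example_translate(bool ok1, bool ok2)
    {
        RCU_READ_LOCK_GUARD();

        if (!ok1) {
            return 1;   /* read lock released on return */
        }
        if (!ok2) {
            return 1;   /* ...and on every other exit path */
        }
        return 0;
    }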
UsageFault", + [EXCP_DIVBYZERO] = "v7M DIVBYZERO UsageFault", }; if (idx >= 0 && idx < ARRAY_SIZE(excnames)) { diff --git a/target/arm/helper.h b/target/arm/helper.h index 248569b0cd..aee8f0019b 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -6,8 +6,8 @@ DEF_HELPER_3(add_saturate, i32, env, i32, i32) DEF_HELPER_3(sub_saturate, i32, env, i32, i32) DEF_HELPER_3(add_usaturate, i32, env, i32, i32) DEF_HELPER_3(sub_usaturate, i32, env, i32, i32) -DEF_HELPER_FLAGS_2(sdiv, TCG_CALL_NO_RWG_SE, s32, s32, s32) -DEF_HELPER_FLAGS_2(udiv, TCG_CALL_NO_RWG_SE, i32, i32, i32) +DEF_HELPER_FLAGS_3(sdiv, TCG_CALL_NO_RWG, s32, env, s32, s32) +DEF_HELPER_FLAGS_3(udiv, TCG_CALL_NO_RWG, i32, env, i32, i32) DEF_HELPER_FLAGS_1(rbit, TCG_CALL_NO_RWG_SE, i32, i32) #define PAS_OP(pfx) \ diff --git a/target/arm/kvm.c b/target/arm/kvm.c index d8381ba224..5d55de1a49 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -998,7 +998,6 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, hwaddr xlat, len, doorbell_gpa; MemoryRegionSection mrs; MemoryRegion *mr; - int ret = 1; if (as == &address_space_memory) { return 0; @@ -1006,15 +1005,19 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, /* MSI doorbell address is translated by an IOMMU */ - rcu_read_lock(); + RCU_READ_LOCK_GUARD(); + mr = address_space_translate(as, address, &xlat, &len, true, MEMTXATTRS_UNSPECIFIED); + if (!mr) { - goto unlock; + return 1; } + mrs = memory_region_find(mr, xlat, 1); + if (!mrs.mr) { - goto unlock; + return 1; } doorbell_gpa = mrs.offset_within_address_space; @@ -1025,11 +1028,7 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, trace_kvm_arm_fixup_msi_route(address, doorbell_gpa); - ret = 0; - -unlock: - rcu_read_unlock(); - return ret; + return 0; } int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c index 20761c9487..47903b3dc3 100644 --- a/target/arm/m_helper.c +++ b/target/arm/m_helper.c @@ -2252,6 +2252,10 @@ void arm_v7m_cpu_do_interrupt(CPUState *cs) armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; break; + case EXCP_DIVBYZERO: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK; + break; case EXCP_SWI: /* The PC already points to the next instruction. */ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure); diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 595d97568e..8744681629 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -35,11 +35,35 @@ &2scalar qd qn rm size &1imm qd imm cmode op &2shift qd qm shift size +&vidup qd rn size imm +&viwdup qd rn rm size imm +&vcmp qm qn size mask +&vcmp_scalar qn rm size mask +&shl_scalar qda rm size +&vmaxv qm rda size +&vabav qn qm rda size +&vldst_sg qd qm rn size msize os +&vldst_sg_imm qd qm a w imm +&vldst_il qd rn size pat w + +# scatter-gather memory size is in bits 6:4 +%sg_msize 6:1 4:1 @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 # Note that both Rn and Qd are 3 bits only (no D bit) @vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr +@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \ + qd=%qd qm=%qm msize=%sg_msize + +# Qm is in the fields usually labeled Qn +@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . 
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 595d97568e..8744681629 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -35,11 +35,35 @@
 &2scalar qd qn rm size
 &1imm qd imm cmode op
 &2shift qd qm shift size
+&vidup qd rn size imm
+&viwdup qd rn rm size imm
+&vcmp qm qn size mask
+&vcmp_scalar qn rm size mask
+&shl_scalar qda rm size
+&vmaxv qm rda size
+&vabav qn qm rda size
+&vldst_sg qd qm rn size msize os
+&vldst_sg_imm qd qm a w imm
+&vldst_il qd rn size pat w
+
+# scatter-gather memory size is in bits 6:4
+%sg_msize 6:1 4:1
 
 @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
 # Note that both Rn and Qd are 3 bits only (no D bit)
 @vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr
 
+@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \
+          qd=%qd qm=%qm msize=%sg_msize
+
+# Qm is in the fields usually labeled Qn
+@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \
+              qd=%qd qm=%qn
+
+# Deinterleaving load/interleaving store
+@vldst_il .... .... .. w:1 . rn:4 .... ... size:2 pat:2 ..... &vldst_il \
+          qd=%qd
+
 @1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm
 @1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0
 @2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn
@@ -84,6 +108,16 @@
 @2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \
          size=2 shift=%rshift_i5
 
+@shl_scalar .... .... .... size:2 .. .... .... .... rm:4 &shl_scalar qda=%qd
+
+# Vector comparison; 4-bit Qm but 3-bit Qn
+%mask_22_13 22:1 13:3
+@vcmp .... .... .. size:2 qn:3 . .... .... .... .... &vcmp qm=%qm mask=%mask_22_13
+@vcmp_scalar .... .... .. size:2 qn:3 . .... .... .... rm:4 &vcmp_scalar \
+             mask=%mask_22_13
+
+@vmaxv .... .... .... size:2 .. rda:4 .... .... .... &vmaxv qm=%qm
+
 # Vector loads and stores
 
 # Widening loads and narrowing stores:
@@ -119,6 +153,26 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \
 VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \
           size=2 p=1
 
+# gather loads/scatter stores
+VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
+VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg
+VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg
+
+VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm
+VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm
+VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm
+VSTRD_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1111 .... .... @vldst_sg_imm
+
+# deinterleaving loads/interleaving stores
+VLD2 1111 1100 1 .. 1 .... ... 1 111 .. .. 00000 @vldst_il
+VLD4 1111 1100 1 .. 1 .... ... 1 111 .. .. 00001 @vldst_il
+VST2 1111 1100 1 .. 0 .... ... 1 111 .. .. 00000 @vldst_il
+VST4 1111 1100 1 .. 0 .... ... 1 111 .. .. 00001 @vldst_il
+
+# Moves between 2 32-bit vector lanes and 2 general purpose registers
+VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd
+VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd
+
 # Vector 2-op
 VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
 VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz
@@ -136,6 +190,11 @@ VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
   VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
   VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
 
+  VQMOVUNB 111 0 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op
+  VQMOVN_BS 111 0 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op
+
+  VMAXA 111 0 1110 0 . 11 .. 11 ... 0 1110 1 0 . 0 ... 1 @1op
+
   VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
 }
 
@@ -143,6 +202,9 @@ VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
   VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b
   VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h
 
+  VMOVNB 111 1 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op
+  VQMOVN_BU 111 1 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op
+
   VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op
 }
 
@@ -150,6 +212,11 @@ VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
   VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
   VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
 
+  VQMOVUNT 111 0 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op
+  VQMOVN_TS 111 0 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op
+
+  VMINA 111 0 1110 0 . 11 .. 11 ... 1 1110 1 0 . 0 ... 1 @1op
+
   VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
 }
 
@@ -157,6 +224,9 @@ VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op
   VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b
   VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h
 
+  VMOVNT 111 1 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op
+  VQMOVN_TU 111 1 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op
+
   VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op
 }
 
@@ -173,10 +243,16 @@ VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op
 VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
 VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op
 
-VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
-VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
-VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
-VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+{
+  VMULLP_B 111 . 1110 0 . 11 ... 1 ... 0 1110 . 0 . 0 ... 0 @2op_sz28
+  VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+  VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op
+}
+{
+  VMULLP_T 111 . 1110 0 . 11 ... 1 ... 1 1110 . 0 . 0 ... 0 @2op_sz28
+  VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+  VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op
+}
 
 VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
 VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op
@@ -244,6 +320,9 @@ VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op
 VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op
 VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op
 
+VQABS 1111 1111 1 . 11 .. 00 ... 0 0111 01 . 0 ... 0 @1op
+VQNEG 1111 1111 1 . 11 .. 00 ... 0 0111 11 . 0 ... 0 @1op
+
 &vdup qd rt size
 # Qd is in the fields usually named Qn
 @vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup
@@ -253,6 +332,29 @@ VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0
 VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1
 VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2
 
+# Incrementing and decrementing dup
+
+# VIDUP, VDDUP format immediate: 1 << (immh:imml)
+%imm_vidup 7:1 0:1 !function=vidup_imm
+
+# VIDUP, VDDUP registers: Rm bits [3:1] from insn, bit 0 is 1;
+# Rn bits [3:1] from insn, bit 0 is 0
+%vidup_rm 1:3 !function=times_2_plus_1
+%vidup_rn 17:3 !function=times_2
+
+@vidup .... .... . . size:2 .... .... .... .... .... \
+       qd=%qd imm=%imm_vidup rn=%vidup_rn &vidup
+@viwdup .... .... . . size:2 .... .... .... .... .... \
+        qd=%qd imm=%imm_vidup rm=%vidup_rm rn=%vidup_rn &viwdup
+{
+  VIDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 111 . @vidup
+  VIWDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 ... . @viwdup
+}
+{
+  VDDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 111 . @vidup
+  VDWDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 ... . @viwdup
+}
+
 # multiply-add long dual accumulate
 # rdahi: bits [3:1] from insn, bit 0 is 1
 # rdalo: bits [3:1] from insn, bit 0 is 0
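The !function= hooks used by the VIDUP/VDWDUP patterns above are one-line
transformers that decodetree applies to the extracted field. Their expected
shape, sketched from the comments in the decode file (the real definitions
live in the Arm translate code; only the behaviour documented above is
assumed here):

    /* immediate is 1 << (immh:imml), i.e. 1, 2, 4 or 8 */
    static inline int vidup_imm(DisasContext *s, int x)
    {
        return 1 << x;
    }

    /* Rn comes from bits [3:1] of the insn with bit 0 forced to 0 */
    static inline int times_2(DisasContext *s, int x)
    {
        return x * 2;
    }

    /* Rm comes from bits [3:1] of the insn with bit 0 forced to 1 */
    static inline int times_2_plus_1(DisasContext *s, int x)
    {
        return x * 2 + 1;
    }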
@@ -262,26 +364,76 @@ VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2
 %size_16 16:1 !function=plus_1
 
 &vmlaldav rdahi rdalo size qn qm x a
+&vmladav rda size qn qm x a
 
-@vmlaldav .... .... . ... ... . ... . .... .... qm:3 . \
+@vmlaldav .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \
           qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav
-@vmlaldav_nosz .... .... . ... ... . ... . .... .... qm:3 . \
+@vmlaldav_nosz .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \
                qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav
 
-VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav
-VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav
+@vmladav .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \
+         qn=%qn rda=%rdalo size=%size_16 &vmladav
+@vmladav_nosz .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \
+              qn=%qn rda=%rdalo size=0 &vmladav
 
-VMLSLDAV 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav
+{
+  VMLADAV_S 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav
+  VMLALDAV_S 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav
+}
+{
+  VMLADAV_U 1111 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav
+  VMLALDAV_U 1111 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav
+}
 
-VRMLALDAVH_S 1110 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz
-VRMLALDAVH_U 1111 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz
+{
+  VMLSDAV 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 1 @vmladav
+  VMLSLDAV 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 1 @vmlaldav
+}
 
-VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_nosz
+{
+  VMLSDAV 1111 1110 1111 ... 0 ... . 1110 . 0 . 0 ... 1 @vmladav_nosz
+  VRMLSLDAVH 1111 1110 1 ... ... 0 ... . 1110 . 0 . 0 ... 1 @vmlaldav_nosz
+}
+
+VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz
+VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz
+
+{
+  VMAXV_S 1110 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv
+  VMINV_S 1110 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv
+  VMAXAV 1110 1110 1110 .. 00 .... 1111 0 0 . 0 ... 0 @vmaxv
+  VMINAV 1110 1110 1110 .. 00 .... 1111 1 0 . 0 ... 0 @vmaxv
+  VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz
+  VRMLALDAVH_S 1110 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz
+}
+
+{
+  VMAXV_U 1111 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv
+  VMINV_U 1111 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv
+  VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz
+  VRMLALDAVH_U 1111 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz
+}
 
 # Scalar operations
 
 VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar
 VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar
-VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+
+{
+  VSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar
+  VRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar
+  VQSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar
+  VQRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar
+  VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+}
+
+{
+  VSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar
+  VRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar
+  VQSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar
+  VQRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar
+  VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
+}
+
 VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
 VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar
 VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
@@ -301,11 +453,18 @@ VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar
                size=%size_28
 }
 
-VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar
-
 VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
 
+# The U bit (28) is don't-care because it does not affect the result
+VMLA 111- 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar
+VMLAS 111- 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
+
+VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar
+VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar
+VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar
+VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar
+
 # Vector add across vector
 {
   VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
@@ -313,9 +472,10 @@ VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
          rdahi=%rdahi rdalo=%rdalo
 }
 
-# Predicate operations
-%mask_22_13 22:1 13:3
-VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
+@vabav .... .... .. size:2 .... rda:4 .... .... .... &vabav qn=%qn qm=%qm
+
+VABAV_S 111 0 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav
+VABAV_U 111 1 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav
 
 # Logical immediate operations (1 reg and modified-immediate)
 
@@ -364,6 +524,8 @@ VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h
 VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w
 
 # VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file
+# Note that VMOVL is encoded as "VSHLL with a zero shift count"; we
+# implement it that way rather than special-casing it in the decode.
 VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b
 VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h
 
@@ -425,3 +587,31 @@ VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b
 VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h
 
 VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd
+
+# Comparisons. We expand out the conditions which are split across
+# encodings T1, T2, T3 and the fc bits. These include VPT, which is
+# effectively "VCMP then VPST". A plain "VCMP" has a mask field of zero.
+VCMPEQ 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp
+VCMPNE 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp
+{
+  VPSEL 1111 1110 0 . 11 ... 1 ... 0 1111 . 0 . 0 ... 1 @2op_nosz
+  VCMPCS 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 1 @vcmp
+  VCMPHI 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 1 @vcmp
+}
+VCMPGE 1111 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp
+VCMPLT 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp
+VCMPGT 1111 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp
+VCMPLE 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp
+
+{
+  VPNOT 1111 1110 0 0 11 000 1 000 0 1111 0100 1101
+  VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13
+  VCMPEQ_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 0 0 .... @vcmp_scalar
+}
+VCMPNE_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 0 0 .... @vcmp_scalar
+VCMPCS_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 1 0 .... @vcmp_scalar
+VCMPHI_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 1 0 .... @vcmp_scalar
+VCMPGE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0 1 0 0 .... @vcmp_scalar
+VCMPLT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1 1 0 0 .... @vcmp_scalar
+VCMPGT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0 1 1 0 .... @vcmp_scalar
+VCMPLE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1 1 1 0 .... @vcmp_scalar
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index db5d622085..c2826eb5f9 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -26,6 +26,35 @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 
+static uint16_t mve_eci_mask(CPUARMState *env)
+{
+    /*
+     * Return the mask of which elements in the MVE vector correspond
+     * to beats being executed. The mask has 1 bits for executed lanes
+     * and 0 bits where ECI says this beat was already executed.
+     */
+    int eci;
+
+    if ((env->condexec_bits & 0xf) != 0) {
+        return 0xffff;
+    }
+
+    eci = env->condexec_bits >> 4;
+    switch (eci) {
+    case ECI_NONE:
+        return 0xffff;
+    case ECI_A0:
+        return 0xfff0;
+    case ECI_A0A1:
+        return 0xff00;
+    case ECI_A0A1A2:
+    case ECI_A0A1A2B0:
+        return 0xf000;
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static uint16_t mve_element_mask(CPUARMState *env)
 {
     /*
@@ -64,33 +93,15 @@ static uint16_t mve_element_mask(CPUARMState *env)
          */
         int masklen = env->regs[14] << env->v7m.ltpsize;
        assert(masklen <= 16);
-        mask &= MAKE_64BIT_MASK(0, masklen);
-    }
-
-    if ((env->condexec_bits & 0xf) == 0) {
-        /*
-         * ECI bits indicate which beats are already executed;
-         * we handle this by effectively predicating them out.
-         */
-        int eci = env->condexec_bits >> 4;
-        switch (eci) {
-        case ECI_NONE:
-            break;
-        case ECI_A0:
-            mask &= 0xfff0;
-            break;
-        case ECI_A0A1:
-            mask &= 0xff00;
-            break;
-        case ECI_A0A1A2:
-        case ECI_A0A1A2B0:
-            mask &= 0xf000;
-            break;
-        default:
-            g_assert_not_reached();
-        }
+        uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
+        mask &= ltpmask;
     }
 
+    /*
+     * ECI bits indicate which beats are already executed;
+     * we handle this by effectively predicating them out.
+     */
+    mask &= mve_eci_mask(env);
     return mask;
 }
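Since each of the four beats covers 4 bytes of the 16-byte vector, the
switch in mve_eci_mask() amounts to "clear the low 4*n lanes when n beats
are already done"; note that ECI_A0A1A2 and ECI_A0A1A2B0 both leave only
beat 3's lanes set. An equivalent restatement with a hypothetical helper,
purely to make the mapping visible:

    #include <stdint.h>

    /*
     * beats_done = 0..3; ECI_A0A1A2B0 also corresponds to 3 here,
     * because its B0 beat belongs to the following instruction and
     * beat 3 of the current one is still to be executed.
     */
    static uint16_t eci_lane_mask(int beats_done)
    {
        /* 0 -> 0xffff, 1 -> 0xfff0, 2 -> 0xff00, 3 -> 0xf000 */
        return (uint16_t)(0xffff << (4 * beats_done));
    }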
@@ -99,6 +110,8 @@ static void mve_advance_vpt(CPUARMState *env)
     /* Advance the VPT and ECI state if necessary */
     uint32_t vpr = env->v7m.vpr;
     unsigned mask01, mask23;
+    uint16_t inv_mask;
+    uint16_t eci_mask = mve_eci_mask(env);
 
     if ((env->condexec_bits & 0xf) == 0) {
         env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
@@ -110,27 +123,36 @@ static void mve_advance_vpt(CPUARMState *env)
         return;
     }
 
+    /* Invert P0 bits if needed, but only for beats we actually executed */
     mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
     mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
-    if (mask01 > 8) {
-        /* high bit set, but not 0b1000: invert the relevant half of P0 */
-        vpr ^= 0xff;
+    /* Start by assuming we invert all bits corresponding to executed beats */
+    inv_mask = eci_mask;
+    if (mask01 <= 8) {
+        /* MASK01 says don't invert low half of P0 */
+        inv_mask &= ~0xff;
     }
-    if (mask23 > 8) {
-        /* high bit set, but not 0b1000: invert the relevant half of P0 */
-        vpr ^= 0xff00;
+    if (mask23 <= 8) {
+        /* MASK23 says don't invert high half of P0 */
+        inv_mask &= ~0xff00;
     }
-    vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
+    vpr ^= inv_mask;
+    /* Only update MASK01 if beat 1 executed */
+    if (eci_mask & 0xf0) {
+        vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
+    }
+    /* Beat 3 always executes, so update MASK23 */
     vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
     env->v7m.vpr = vpr;
 }
 
-
+/* For loads, predicated lanes are zeroed instead of keeping their old values */
 #define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
     void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
     { \
         TYPE *d = vd; \
         uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
         unsigned b, e; \
         /* \
         * R_SXTM allows the dest reg to become UNKNOWN for abandoned \
@@ -138,8 +160,9 @@ static void mve_advance_vpt(CPUARMState *env)
         * then take an exception. \
         */ \
         for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
-            if (mask & (1 << b)) { \
-                d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \
+            if (eci_mask & (1 << b)) { \
+                d[H##ESIZE(e)] = (mask & (1 << b)) ? \
+                    cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
             } \
             addr += MSIZE; \
         } \
@@ -183,6 +206,504 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
 
 #undef DO_VLDR
 #undef DO_VSTR
 
+/*
+ * Gather loads/scatter stores. Here each element of Qm specifies
+ * an offset to use from the base register Rn. In the _os_ versions
+ * that offset is scaled by the element size.
+ * For loads, predicated lanes are zeroed instead of retaining
+ * their previous values.
+ */
+#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+                          uint32_t base) \
+    { \
+        TYPE *d = vd; \
+        OFFTYPE *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
+        unsigned e; \
+        uint32_t addr; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
+            addr = ADDRFN(base, m[H##ESIZE(e)]); \
+            d[H##ESIZE(e)] = (mask & 1) ? \
+                cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+            if (WB) { \
+                m[H##ESIZE(e)] = addr; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+/* We know here TYPE is unsigned so always the same as the offset type */
+#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+                          uint32_t base) \
+    { \
+        TYPE *d = vd; \
+        TYPE *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
+        unsigned e; \
+        uint32_t addr; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
+            addr = ADDRFN(base, m[H##ESIZE(e)]); \
+            if (mask & 1) { \
+                cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+            } \
+            if (WB) { \
+                m[H##ESIZE(e)] = addr; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+/*
+ * 64-bit accesses are slightly different: they are done as two 32-bit
+ * accesses, controlled by the predicate mask for the relevant beat,
+ * and with a single 32-bit offset in the first of the two Qm elements.
+ * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
+ * Address writeback happens on the odd beats and updates the address
+ * stored in the even-beat element.
+ */
+#define DO_VLDR64_SG(OP, ADDRFN, WB) \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+                          uint32_t base) \
+    { \
+        uint32_t *d = vd; \
+        uint32_t *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
+        unsigned e; \
+        uint32_t addr; \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
+            addr = ADDRFN(base, m[H4(e & ~1)]); \
+            addr += 4 * (e & 1); \
+            d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+            if (WB && (e & 1)) { \
+                m[H4(e & ~1)] = addr - 4; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+#define DO_VSTR64_SG(OP, ADDRFN, WB) \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+                          uint32_t base) \
+    { \
+        uint32_t *d = vd; \
+        uint32_t *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        uint16_t eci_mask = mve_eci_mask(env); \
+        unsigned e; \
+        uint32_t addr; \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+            if (!(eci_mask & 1)) { \
+                continue; \
+            } \
+            addr = ADDRFN(base, m[H4(e & ~1)]); \
+            addr += 4 * (e & 1); \
+            if (mask & 1) { \
+                cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
+            } \
+            if (WB && (e & 1)) { \
+                m[H4(e & ~1)] = addr - 4; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
+#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
+#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
+#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
+
+DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) +DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) +DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) + +DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) +DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) +DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) +DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) +DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) + +/* + * Deinterleaving loads/interleaving stores. + * + * For these helpers we are passed the index of the first Qreg + * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3) + * and the value of the base address register Rn. + * The helpers are specialized for pattern and element size, so + * for instance vld42h is VLD4 with pattern 2, element size MO_16. + * + * These insns are beatwise but not predicated, so we must honour ECI, + * but need not look at mve_element_mask(). + * + * The pseudocode implements these insns with multiple memory accesses + * of the element size, but rules R_VVVG and R_FXDM permit us to make + * one 32-bit memory access per beat. 
+ */ +#define DO_VLD4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H1(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H2(off[beat])] = data; \ + data >>= 16; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + qd[H2(off[beat])] = data; \ + } \ + } + +#define DO_VLD4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H4(off[beat] >> 2)] = data; \ + } \ + } + +DO_VLD4B(vld40b, 0, 1, 10, 11) +DO_VLD4B(vld41b, 2, 3, 12, 13) +DO_VLD4B(vld42b, 4, 5, 14, 15) +DO_VLD4B(vld43b, 6, 7, 8, 9) + +DO_VLD4H(vld40h, 0, 5) +DO_VLD4H(vld41h, 1, 6) +DO_VLD4H(vld42h, 2, 7) +DO_VLD4H(vld43h, 3, 4) + +DO_VLD4W(vld40w, 0, 1, 10, 11) +DO_VLD4W(vld41w, 2, 3, 12, 13) +DO_VLD4W(vld42w, 4, 5, 14, 15) +DO_VLD4W(vld43w, 6, 7, 8, 9) + +#define DO_VLD2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + qd[H1(off[beat] + (e >> 1))] = data; \ + } \ + } \ + } + +#define DO_VLD2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 2; e++, data >>= 16) 
{ \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H2(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + qd[H4(off[beat] >> 3)] = data; \ + } \ + } + +DO_VLD2B(vld20b, 0, 2, 12, 14) +DO_VLD2B(vld21b, 4, 6, 8, 10) + +DO_VLD2H(vld20h, 0, 1, 6, 7) +DO_VLD2H(vld21h, 2, 3, 4, 5) + +DO_VLD2W(vld20w, 0, 4, 24, 28) +DO_VLD2W(vld21w, 8, 12, 16, 20) + +#define DO_VST4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 8) | qd[H1(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H2(off[beat])]; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + data |= qd[H2(off[beat])] << 16; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H4(off[beat] >> 2)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST4B(vst40b, 0, 1, 10, 11) +DO_VST4B(vst41b, 2, 3, 12, 13) +DO_VST4B(vst42b, 4, 5, 14, 15) +DO_VST4B(vst43b, 6, 7, 8, 9) + +DO_VST4H(vst40h, 0, 5) +DO_VST4H(vst41h, 1, 6) +DO_VST4H(vst42h, 2, 7) +DO_VST4H(vst43h, 3, 4) + +DO_VST4W(vst40w, 0, 1, 10, 11) +DO_VST4W(vst41w, 2, 3, 12, 13) +DO_VST4W(vst42w, 4, 5, 14, 15) +DO_VST4W(vst43w, 6, 7, 8, 9) + +#define DO_VST2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat 
< 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 1; e >= 0; e--) { \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 16) | qd[H2(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + data = qd[H4(off[beat] >> 3)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST2B(vst20b, 0, 2, 12, 14) +DO_VST2B(vst21b, 4, 6, 8, 10) + +DO_VST2H(vst20h, 0, 1, 6, 7) +DO_VST2H(vst21h, 2, 3, 4, 5) + +DO_VST2W(vst20w, 0, 4, 24, 28) +DO_VST2W(vst21w, 8, 12, 16, 20) + /* * The mergemask(D, R, M) macro performs the operation "*D = R" but * storing only the bytes which correspond to 1 bits in M, @@ -458,6 +979,22 @@ DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) +/* + * Polynomial multiply. We can always do this generating 64 bits + * of the result at a time, so we don't need to use DO_2OP_L. + */ +#define VMULLPH_MASK 0x00ff00ff00ff00ffULL +#define VMULLPW_MASK 0x0000ffff0000ffffULL +#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) +#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) +#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) +#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) + +DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) +DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) +DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) +DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) + /* * Because the computation type is at least twice as large as required, * these work for both signed and unsigned source types. 
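For readers unfamiliar with polynomial (carry-less) multiplication: the pmull_h() and pmull_w() helpers invoked by the DO_VMULLP* macros above combine partial products with XOR rather than ADD, so no carries propagate between bit positions. A minimal single-lane sketch of the idea, using a hypothetical clmul_8x8() rather than the real helper (which handles four byte lanes of a 64-bit operand at once, hence the VMULLPH_MASK masking above), might look like:

#include <stdint.h>

/*
 * Sketch of one 8x8 -> 16 bit carry-less multiply lane. This is an
 * illustration of the operation, not QEMU code: the real pmull_h()
 * processes four such lanes packed in a 64-bit value, which is why
 * the _B forms mask each wide lane with 0x00ff00ff00ff00ffULL and
 * the _T forms shift the high half down first.
 */
static uint16_t clmul_8x8(uint8_t n, uint8_t m)
{
    uint16_t result = 0;
    int i;

    for (i = 0; i < 8; i++) {
        if (n & (1u << i)) {
            result ^= (uint16_t)(m << i);  /* XOR of shifted partial products */
        }
    }
    return result;
}
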
@@ -909,6 +1446,44 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
         mve_advance_vpt(env);                                   \
     }
 
+/* "accumulating" version where FN takes d as well as n and m */
+#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN)                  \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+                                uint32_t rm)                    \
+    {                                                           \
+        TYPE *d = vd, *n = vn;                                  \
+        TYPE m = rm;                                            \
+        uint16_t mask = mve_element_mask(env);                  \
+        unsigned e;                                             \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
+            mergemask(&d[H##ESIZE(e)],                          \
+                      FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \
+        }                                                       \
+        mve_advance_vpt(env);                                   \
+    }
+
+#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN)              \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+                                uint32_t rm)                    \
+    {                                                           \
+        TYPE *d = vd, *n = vn;                                  \
+        TYPE m = rm;                                            \
+        uint16_t mask = mve_element_mask(env);                  \
+        unsigned e;                                             \
+        bool qc = false;                                        \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
+            bool sat = false;                                   \
+            mergemask(&d[H##ESIZE(e)],                          \
+                      FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
+                      mask);                                    \
+            qc |= sat & mask & 1;                               \
+        }                                                       \
+        if (qc) {                                               \
+            env->vfp.qc[0] = qc;                                \
+        }                                                       \
+        mve_advance_vpt(env);                                   \
+    }
+
 /* provide unsigned 2-op scalar helpers for all sizes */
 #define DO_2OP_SCALAR_U(OP, FN)                                 \
     DO_2OP_SCALAR(OP##b, 1, uint8_t, FN)                        \
@@ -919,6 +1494,11 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
     DO_2OP_SCALAR(OP##h, 2, int16_t, FN)                        \
     DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
 
+#define DO_2OP_ACC_SCALAR_U(OP, FN)                             \
+    DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN)                    \
+    DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN)                   \
+    DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN)
+
 DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
 DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
 DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
@@ -948,6 +1528,89 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
 DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
 DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
 
+static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
+{
+    int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
+    return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
+                            int round, bool *sat)
+{
+    int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
+    return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
+                            int round, bool *sat)
+{
+    /*
+     * Architecturally we should do the entire add, double, round
+     * and then check for saturation. We do three saturating adds,
+     * but we need to be careful about the order. If the first
+     * m1 + m2 saturates then it's impossible for the *2+rc to
+     * bring it back into the non-saturated range. However, if
+     * m1 + m2 is negative then it's possible that doing the doubling
+     * would take the intermediate result below INT64_MIN and the
+     * addition of the rounding constant then brings it back in range.
+     * So we add half the rounding constant and half the "c << esize"
+     * before doubling rather than adding the rounding constant after
+     * the doubling.
+     */
+    int64_t m1 = (int64_t)a * b;
+    int64_t m2 = (int64_t)c << 31;
+    int64_t r;
+    if (sadd64_overflow(m1, m2, &r) ||
+        sadd64_overflow(r, (round << 30), &r) ||
+        sadd64_overflow(r, r, &r)) {
+        *sat = true;
+        return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +/* + * The *MLAH insns are vector * scalar + vector; + * the *MLASH insns are vector * vector + scalar + */ +#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S) +#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S) +#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S) +#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S) +#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S) +#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S) + +#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S) +#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S) +#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S) +#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S) +#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S) +#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S) + +DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W) + +DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W) + +/* Vector by scalar plus vector */ +#define DO_VMLA(D, N, M) ((N) * (M) + (D)) + +DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA) + +/* Vector by vector plus scalar */ +#define DO_VMLAS(D, N, M) ((N) * (D) + (M)) + +DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS) + /* * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. @@ -1124,6 +1787,47 @@ DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) +/* + * Multiply add dual accumulate ops + */ +#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC) + +#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC) + +DO_DAV_S(vmladavs, false, +=, +=) +DO_DAV_U(vmladavu, false, +=, +=) +DO_DAV_S(vmlsdav, false, +=, -=) +DO_DAV_S(vmladavsx, true, +=, +=) +DO_DAV_S(vmlsdavx, true, +=, -=) + /* * Rounding multiply add long dual accumulate high. 
In the pseudocode * this is implemented with a 72-bit internal accumulator value of which @@ -1182,13 +1886,105 @@ DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true) return ra; \ } \ -DO_VADDV(vaddvsb, 1, uint8_t) -DO_VADDV(vaddvsh, 2, uint16_t) -DO_VADDV(vaddvsw, 4, uint32_t) +DO_VADDV(vaddvsb, 1, int8_t) +DO_VADDV(vaddvsh, 2, int16_t) +DO_VADDV(vaddvsw, 4, int32_t) DO_VADDV(vaddvub, 1, uint8_t) DO_VADDV(vaddvuh, 2, uint16_t) DO_VADDV(vaddvuw, 4, uint32_t) +/* + * Vector max/min across vector. Unlike VADDV, we must + * read ra as the element size, not its full width. + * We work with int64_t internally for simplicity. + */ +#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra_in) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + int64_t ra = (RATYPE)ra_in; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra = FN(ra, m[H##ESIZE(e)]); \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +#define DO_VMAXMINV_U(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN) +#define DO_VMAXMINV_S(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN) + +/* + * Helpers for max and min of absolute values across vector: + * note that we only take the absolute value of 'm', not 'n' + */ +static int64_t do_maxa(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MAX(n, m); +} + +static int64_t do_mina(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MIN(n, m); +} + +DO_VMAXMINV_S(vmaxvs, DO_MAX) +DO_VMAXMINV_U(vmaxvu, DO_MAX) +DO_VMAXMINV_S(vminvs, DO_MIN) +DO_VMAXMINV_U(vminvu, DO_MIN) +/* + * VMAXAV, VMINAV treat the general purpose input as unsigned + * and the vector elements as signed. + */ +DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa) +DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa) +DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa) +DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina) +DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina) +DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina) + +#define DO_VABAV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm, *n = vn; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + int64_t n0 = n[H##ESIZE(e)]; \ + int64_t m0 = m[H##ESIZE(e)]; \ + uint32_t r = n0 >= m0 ? 
(n0 - m0) : (m0 - n0); \ + ra += r; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } + +DO_VABAV(vabavsb, 1, int8_t) +DO_VABAV(vabavsh, 2, int16_t) +DO_VABAV(vabavsw, 4, int32_t) +DO_VABAV(vabavub, 1, uint8_t) +DO_VABAV(vabavuh, 2, uint16_t) +DO_VABAV(vabavuw, 4, uint32_t) + #define DO_VADDLV(OP, TYPE, LTYPE) \ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ uint64_t ra) \ @@ -1269,6 +2065,8 @@ DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP) DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP) DO_2SHIFT_U(vrshli_u, DO_VRSHLU) DO_2SHIFT_S(vrshli_s, DO_VRSHLS) +DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP) +DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP) /* Shift-and-insert; we always work with 64 bits at a time */ #define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \ @@ -1279,11 +2077,12 @@ DO_2SHIFT_S(vrshli_s, DO_VRSHLS) uint16_t mask; \ uint64_t shiftmask; \ unsigned e; \ - if (shift == 0 || shift == ESIZE * 8) { \ + if (shift == ESIZE * 8) { \ /* \ - * Only VSLI can shift by 0; only VSRI can shift by