From 82faef92fb149b6bad699c3275473fda6fd486b6 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 3 Jun 2021 18:12:57 +0100 Subject: [PATCH 01/57] hw/acpi: Provide stub version of acpi_ghes_record_errors() Generic code in target/arm wants to call acpi_ghes_record_errors(); provide a stub version so that we don't fail to link when CONFIG_ACPI_APEI is not set. This requires us to add a new ghes-stub.c file to contain it and the meson.build mechanics to use it when appropriate. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Reviewed-by: Dongjiu Geng Message-id: 20210603171259.27962-2-peter.maydell@linaro.org --- hw/acpi/ghes-stub.c | 17 +++++++++++++++++ hw/acpi/meson.build | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 hw/acpi/ghes-stub.c diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c new file mode 100644 index 0000000000..9faba043b8 --- /dev/null +++ b/hw/acpi/ghes-stub.c @@ -0,0 +1,17 @@ +/* + * Support for generating APEI tables and recording CPER for Guests: + * stub functions. + * + * Copyright (c) 2021 Linaro, Ltd + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/acpi/ghes.h" + +int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) +{ + return -1; +} diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index dd69577212..9b7fa75719 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -13,13 +13,13 @@ acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c')) acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c')) acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c')) acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c')) -acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c')) +acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_X86', if_true: files('core.c', 'piix4.c', 'pcihp.c'), if_false: files('acpi-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_X86_ICH', if_true: files('ich9.c', 'tco.c')) acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c')) acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c')) acpi_ss.add(when: 'CONFIG_TPM', if_true: files('tpm.c')) -softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c')) +softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c')) softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss) softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c', - 'acpi-x86-stub.c', 'ipmi-stub.c')) + 'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c')) From 1c81f5735af3e2356bb75482a58786a4f26d8300 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:24:37 +0100 Subject: [PATCH 02/57] hw/acpi: Provide function acpi_ghes_present() Allow code elsewhere in the system to check whether the ACPI GHES table is present, so it can determine whether it is OK to try to record an error by calling acpi_ghes_record_errors(). (We don't need to migrate the new 'present' field in AcpiGhesState, because it is set once at system initialization and doesn't change.) 
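For illustration, the expected call pattern is a sketch like this
(not code added by this patch; 'paddr' stands in for whatever guest
physical address the caller has computed, and ACPI_HEST_SRC_ID_SEA is
the existing SEA error source id from ghes.h):

    if (acpi_ghes_present()) {
        /* Only now is it safe to try to record the error */
        acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr);
    }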
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Reviewed-by: Dongjiu Geng Message-id: 20210603171259.27962-3-peter.maydell@linaro.org --- hw/acpi/ghes-stub.c | 5 +++++ hw/acpi/ghes.c | 17 +++++++++++++++++ include/hw/acpi/ghes.h | 9 +++++++++ 3 files changed, 31 insertions(+) diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c index 9faba043b8..c315de1802 100644 --- a/hw/acpi/ghes-stub.c +++ b/hw/acpi/ghes-stub.c @@ -15,3 +15,8 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) { return -1; } + +bool acpi_ghes_present(void) +{ + return false; +} diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c index a4dac6bf15..a749b84d62 100644 --- a/hw/acpi/ghes.c +++ b/hw/acpi/ghes.c @@ -386,6 +386,8 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s, /* Create a read-write fw_cfg file for Address */ fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL, NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false); + + ags->present = true; } int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) @@ -443,3 +445,18 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) return ret; } + +bool acpi_ghes_present(void) +{ + AcpiGedState *acpi_ged_state; + AcpiGhesState *ags; + + acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, + NULL)); + + if (!acpi_ged_state) { + return false; + } + ags = &acpi_ged_state->ghes_state; + return ags->present; +} diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h index 2ae8bc1ded..674f6958e9 100644 --- a/include/hw/acpi/ghes.h +++ b/include/hw/acpi/ghes.h @@ -64,6 +64,7 @@ enum { typedef struct AcpiGhesState { uint64_t ghes_addr_le; + bool present; /* True if GHES is present at all on this board */ } AcpiGhesState; void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker); @@ -72,4 +73,12 @@ void acpi_build_hest(GArray *table_data, BIOSLinker *linker, void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s, GArray *hardware_errors); int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr); + +/** + * acpi_ghes_present: Report whether ACPI GHES table is present + * + * Returns: true if the system has an ACPI GHES table and it is + * safe to call acpi_ghes_record_errors() to record a memory error. + */ +bool acpi_ghes_present(void); #endif From 15613357ba53a4763594f64476058b85b3014757 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:24:38 +0100 Subject: [PATCH 03/57] target/arm: Use acpi_ghes_present() to see if we report ACPI memory errors The virt_is_acpi_enabled() function is specific to the virt board, as is the check for its 'ras' property. Use the new acpi_ghes_present() function to check whether we should report memory errors via acpi_ghes_record_errors(). This avoids a link error if QEMU was built without support for the virt board, and provides a mechanism that can be used by any future board models that want to add ACPI memory error reporting support (they only need to call acpi_ghes_add_fw_cfg()). 
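As a sketch of what that would look like (assuming a board model that
already creates a GED device with an embedded AcpiGhesState and builds
the hardware_errors table; 'ged_state', 'fw_cfg' and 'hardware_errors'
are placeholder names for the board's own objects), the board code
would do roughly:

    AcpiGhesState *ags = &ged_state->ghes_state;

    acpi_ghes_add_fw_cfg(ags, fw_cfg, hardware_errors);

after which acpi_ghes_present() returns true and recording errors
becomes possible.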
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Reviewed-by: Dongjiu Geng Message-id: 20210603171259.27962-4-peter.maydell@linaro.org --- target/arm/kvm64.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 37ceadd9a9..59982d470d 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -1410,14 +1410,10 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { ram_addr_t ram_addr; hwaddr paddr; - Object *obj = qdev_get_machine(); - VirtMachineState *vms = VIRT_MACHINE(obj); - bool acpi_enabled = virt_is_acpi_enabled(vms); assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); - if (acpi_enabled && addr && - object_property_get_bool(obj, "ras", NULL)) { + if (acpi_ghes_present() && addr) { ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { From 741292face087213b846faed7055ff51b7fee2dd Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 15:03:28 +0100 Subject: [PATCH 04/57] docs/system/arm: Document which architecture extensions we emulate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These days the Arm architecture has a wide range of fine-grained optional extra architectural features. We implement quite a lot of these but by no means all of them. Document what we do implement, so that users can find out without having to dig through back-issues of our Changelog on the wiki. Signed-off-by: Peter Maydell Reviewed-by: Alex Bennée Message-id: 20210617140328.28622-1-peter.maydell@linaro.org Reviewed-by: Richard Henderson --- docs/system/arm/emulation.rst | 102 ++++++++++++++++++++++++++++++++++ docs/system/target-arm.rst | 6 ++ 2 files changed, 108 insertions(+) create mode 100644 docs/system/arm/emulation.rst diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst new file mode 100644 index 0000000000..836c1ca845 --- /dev/null +++ b/docs/system/arm/emulation.rst @@ -0,0 +1,102 @@ +A-profile CPU architecture support +================================== + +QEMU's TCG emulation includes support for the Armv5, Armv6, Armv7 and +Armv8 versions of the A-profile architecture. 
It also has support for
+the following architecture extensions:
+
+- FEAT_AA32BF16 (AArch32 BFloat16 instructions)
+- FEAT_AA32HPD (AArch32 hierarchical permission disables)
+- FEAT_AA32I8MM (AArch32 Int8 matrix multiplication instructions)
+- FEAT_AES (AESD and AESE instructions)
+- FEAT_BF16 (AArch64 BFloat16 instructions)
+- FEAT_BTI (Branch Target Identification)
+- FEAT_DIT (Data Independent Timing instructions)
+- FEAT_DPB (DC CVAP instruction)
+- FEAT_DotProd (Advanced SIMD dot product instructions)
+- FEAT_FCMA (Floating-point complex number instructions)
+- FEAT_FHM (Floating-point half-precision multiplication instructions)
+- FEAT_FP16 (Half-precision floating-point data processing)
+- FEAT_FRINTTS (Floating-point to integer instructions)
+- FEAT_FlagM (Flag manipulation instructions v2)
+- FEAT_FlagM2 (Enhancements to flag manipulation instructions)
+- FEAT_HPDS (Hierarchical permission disables)
+- FEAT_I8MM (AArch64 Int8 matrix multiplication instructions)
+- FEAT_JSCVT (JavaScript conversion instructions)
+- FEAT_LOR (Limited ordering regions)
+- FEAT_LRCPC (Load-acquire RCpc instructions)
+- FEAT_LRCPC2 (Load-acquire RCpc instructions v2)
+- FEAT_LSE (Large System Extensions)
+- FEAT_MTE (Memory Tagging Extension)
+- FEAT_MTE2 (Memory Tagging Extension)
+- FEAT_PAN (Privileged access never)
+- FEAT_PAN2 (AT S1E1R and AT S1E1W instruction variants affected by PSTATE.PAN)
+- FEAT_PAuth (Pointer authentication)
+- FEAT_PMULL (PMULL, PMULL2 instructions)
+- FEAT_PMUv3p1 (PMU Extensions v3.1)
+- FEAT_PMUv3p4 (PMU Extensions v3.4)
+- FEAT_RDM (Advanced SIMD rounding double multiply accumulate instructions)
+- FEAT_RNG (Random number generator)
+- FEAT_SB (Speculation Barrier)
+- FEAT_SEL2 (Secure EL2)
+- FEAT_SHA1 (SHA1 instructions)
+- FEAT_SHA256 (SHA256 instructions)
+- FEAT_SHA3 (Advanced SIMD SHA3 instructions)
+- FEAT_SHA512 (Advanced SIMD SHA512 instructions)
+- FEAT_SM3 (Advanced SIMD SM3 instructions)
+- FEAT_SM4 (Advanced SIMD SM4 instructions)
+- FEAT_SPECRES (Speculation restriction instructions)
+- FEAT_SSBS (Speculative Store Bypass Safe)
+- FEAT_TLBIOS (TLB invalidate instructions in Outer Shareable domain)
+- FEAT_TLBIRANGE (TLB invalidate range instructions)
+- FEAT_TTCNP (Translation table Common not private translations)
+- FEAT_TTST (Small translation tables)
+- FEAT_UAO (Unprivileged Access Override control)
+- FEAT_VHE (Virtualization Host Extensions)
+- FEAT_VMID16 (16-bit VMID)
+- FEAT_XNX (Translation table stage 2 Unprivileged Execute-never)
+- SVE (The Scalable Vector Extension)
+- SVE2 (The Scalable Vector Extension v2)
+
+For information on the specifics of these extensions, please refer
+to the `Armv8-A Arm Architecture Reference Manual
+<https://developer.arm.com/documentation/ddi0487/latest>`_.
+
+When a specific named CPU is being emulated, only those features which
+are present in hardware for that CPU are emulated. (If a feature is
+not in the list above then it is not supported, even if the real
+hardware should have it.) The ``max`` CPU enables all features.
+
+R-profile CPU architecture support
+==================================
+
+QEMU's TCG emulation support for R-profile CPUs is currently limited.
+We emulate only the Cortex-R5 and Cortex-R5F CPUs.
+
+M-profile CPU architecture support
+==================================
+
+QEMU's TCG emulation includes support for Armv6-M, Armv7-M, Armv8-M, and
+Armv8.1-M versions of the M-profile architecture.
It also has support
+for the following architecture extensions:
+
+- FP (Floating-point Extension)
+- FPCXT (FPCXT access instructions)
+- HP (Half-precision floating-point instructions)
+- LOB (Low Overhead loops and Branch future)
+- M (Main Extension)
+- MPU (Memory Protection Unit Extension)
+- PXN (Privileged Execute Never)
+- RAS (Reliability, Serviceability and Availability): "minimum RAS Extension" only
+- S (Security Extension)
+- ST (System Timer Extension)
+
+For information on the specifics of these extensions, please refer
+to the `Armv8-M Arm Architecture Reference Manual
+<https://developer.arm.com/documentation/ddi0553/latest>`_.
+
+When a specific named CPU is being emulated, only those features which
+are present in hardware for that CPU are emulated. (If a feature is
+not in the list above then it is not supported, even if the real
+hardware should have it.) There is no equivalent of the ``max`` CPU for
+M-profile.
diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst
index edd013c7bb..8b8547f9a9 100644
--- a/docs/system/target-arm.rst
+++ b/docs/system/target-arm.rst
@@ -99,6 +99,12 @@ undocumented; you can get a complete list by running
    arm/virt
    arm/xlnx-versal-virt
 
+Emulated CPU architecture support
+=================================
+
+.. toctree::
+   arm/emulation
+
 Arm CPU features
 ================

From 41b3ffc59966c78383e177e1dd38f884e886d960 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Fri, 18 Jun 2021 15:10:13 +0100
Subject: [PATCH 05/57] target/arm/translate-vfp.c: Whitespace fixes

In the code for handling VFP system register accesses there is some
stray whitespace after a unary '-' operator, and also some incorrect
indent in a couple of function prototypes. We're about to move this
code to another file, so fix the code style issues first so
checkpatch doesn't complain about the code-movement patch.
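Concretely, the stray whitespace is the kind of thing where

    offset = - offset;

has to become

    offset = -offset;

so that checkpatch.pl does not flag the lines when they are later
moved verbatim into the new file.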
Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210618141019.10671-2-peter.maydell@linaro.org --- target/arm/translate-vfp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 01e26a246d..5a4a13ec1e 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -771,9 +771,8 @@ static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, } static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, - fp_sysreg_loadfn *loadfn, - void *opaque) + void *opaque) { /* Do a write to an M-profile floating point system register */ TCGv_i32 tmp; @@ -874,8 +873,8 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, } static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, - fp_sysreg_storefn *storefn, - void *opaque) + fp_sysreg_storefn *storefn, + void *opaque) { /* Do a read from an M-profile floating point system register */ TCGv_i32 tmp; @@ -1207,7 +1206,7 @@ static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) TCGv_i32 addr; if (!a->a) { - offset = - offset; + offset = -offset; } addr = load_reg(s, a->rn); @@ -1242,7 +1241,7 @@ static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque) TCGv_i32 value = tcg_temp_new_i32(); if (!a->a) { - offset = - offset; + offset = -offset; } addr = load_reg(s, a->rn); From 9931d9d84bec87fae30b69590420d8ae459387a6 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 18 Jun 2021 15:10:14 +0100 Subject: [PATCH 06/57] target/arm: Handle FPU being disabled in FPCXT_NS accesses If the guest makes an FPCXT_NS access when the FPU is disabled, one of two things happens: * if there is no active FP context, then the insn behaves the same way as if the FPU was enabled: writes ignored, reads same value as FPDSCR_NS * if there is an active FP context, then we take a NOCP exception Add code to the sysreg read/write functions which emits code to take the NOCP exception in the latter case. At the moment this will never be used, because the NOCP checks in m-nocp.decode happen first, and so the trans functions are never called when the FPU is disabled. The code will be needed when we move the sysreg access insns to before the NOCP patterns in the following commit. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210618141019.10671-3-peter.maydell@linaro.org --- target/arm/translate-vfp.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 5a4a13ec1e..107d6143be 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -821,7 +821,21 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, lab_end = gen_new_label(); /* fpInactive case: write is a NOP, so branch to end */ gen_branch_fpInactive(s, TCG_COND_NE, lab_end); - /* !fpInactive: PreserveFPState(), and reads same as FPCXT_S */ + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS writes + * behave the same as FPCXT_S writes. 
+ */ + if (s->fp_excp_el) { + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } gen_preserve_fp_state(s); /* fall through */ case ARM_VFP_FPCXT_S: @@ -961,7 +975,21 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, tcg_gen_br(lab_end); gen_set_label(lab_active); - /* !fpInactive: Reads the same as FPCXT_S, but side effects differ */ + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS + * reads the same as FPCXT_S. + */ + if (s->fp_excp_el) { + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } gen_preserve_fp_state(s); tmp = tcg_temp_new_i32(); sfpa = tcg_temp_new_i32(); From fa856736b6d0dabdcbe1b199ef2bb4fdec0f4911 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 18 Jun 2021 15:10:15 +0100 Subject: [PATCH 07/57] target/arm: Don't NOCP fault for FPCXT_NS accesses The M-profile architecture requires that accesses to FPCXT_NS when there is no active FP state must not take a NOCP fault even if the FPU is disabled. We were not implementing this correctly, because in our decode we catch the NOCP faults early in m-nocp.decode. Fix this bug by moving all the handling of M-profile FP system register accesses from vfp.decode into m-nocp.decode and putting it above the NOCP blocks. This provides the correct behaviour: * for accesses other than FPCXT_NS the trans functions call vfp_access_check(), which will check for FPU disabled and raise a NOCP exception if necessary * for FPCXT_NS we have the special case code that doesn't call vfp_access_check() * when these trans functions want to raise an UNDEF they return false, so the decoder will fall through into the NOCP blocks. This means that NOCP correctly takes precedence over UNDEF for these insns. (This is a difference from the other insns handled by m-nocp.decode, where UNDEF takes precedence and which we implement by having those trans functions call unallocated_encoding() in the appropriate places.) [Note for backport to stable: this commit has a semantic dependency on commit 9a486856e9173af, which was not marked as cc-stable because we didn't know we'd need it for a for-stable bugfix.] Cc: qemu-stable@nongnu.org Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210618141019.10671-4-peter.maydell@linaro.org --- target/arm/m-nocp.decode | 24 ++ target/arm/translate-a32.h | 1 + target/arm/translate-m-nocp.c | 514 +++++++++++++++++++++++++++++++++ target/arm/translate-vfp.c | 517 +--------------------------------- target/arm/vfp.decode | 14 - 5 files changed, 542 insertions(+), 528 deletions(-) diff --git a/target/arm/m-nocp.decode b/target/arm/m-nocp.decode index 6699626d7c..b65c801c97 100644 --- a/target/arm/m-nocp.decode +++ b/target/arm/m-nocp.decode @@ -34,6 +34,14 @@ &nocp cp +# M-profile VLDR/VSTR to sysreg +%vldr_sysreg 22:1 13:3 +%imm7_0x4 0:7 !function=times_4 + +&vldr_sysreg rn reg imm a w p +@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... 
\ + reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg + { # Special cases which do not take an early NOCP: VLLDM and VLSTM VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000 @@ -41,6 +49,22 @@ VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3 VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2 + # FP system register accesses: these are a special case because accesses + # to FPCXT_NS succeed even if the FPU is disabled. We therefore need + # to handle them before the big NOCP blocks. Note that within these + # insns NOCP still has higher priority than UNDEFs; this is implemented + # by their returning 'false' for UNDEF so as to fall through into the + # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding() + # for the UNDEFs there that must take precedence over NOCP.) + + VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 + + # P=0 W=0 is SEE "Related encodings", so split into two patterns + VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 + VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 + NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp # From v8.1M onwards this range will also NOCP: diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index 0a0053949f..abb3ecb6bc 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -32,6 +32,7 @@ bool disas_neon_shared(DisasContext *s, uint32_t insn); void load_reg_var(DisasContext *s, TCGv_i32 var, int reg); void arm_gen_condlabel(DisasContext *s); bool vfp_access_check(DisasContext *s); +void gen_preserve_fp_state(DisasContext *s); void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop); void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop); void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop); diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c index 09b3be4ed3..17fd2bf2fb 100644 --- a/target/arm/translate-m-nocp.c +++ b/target/arm/translate-m-nocp.c @@ -19,6 +19,7 @@ #include "qemu/osdep.h" #include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "translate.h" #include "translate-a32.h" @@ -191,6 +192,519 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) return true; } +/* + * M-profile provides two different sets of instructions that can + * access floating point system registers: VMSR/VMRS (which move + * to/from a general purpose register) and VLDR/VSTR sysreg (which + * move directly to/from memory). In some cases there are also side + * effects which must happen after any write to memory (which could + * cause an exception). So we implement the common logic for the + * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), + * which take pointers to callback functions which will perform the + * actual "read/write general purpose register" and "read/write + * memory" operations. + */ + +/* + * Emit code to store the sysreg to its final destination; frees the + * TCG temp 'value' it is passed. 
+ */ +typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value); +/* + * Emit code to load the value to be copied to the sysreg; returns + * a new TCG temporary + */ +typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque); + +/* Common decode/access checks for fp sysreg read/write */ +typedef enum FPSysRegCheckResult { + FPSysRegCheckFailed, /* caller should return false */ + FPSysRegCheckDone, /* caller should return true */ + FPSysRegCheckContinue, /* caller should continue generating code */ +} FPSysRegCheckResult; + +static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) +{ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + + switch (regno) { + case ARM_VFP_FPSCR: + case QEMU_VFP_FPSCR_NZCV: + break; + case ARM_VFP_FPSCR_NZCVQC: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_FPCXT_S: + case ARM_VFP_FPCXT_NS: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + if (!s->v8m_secure) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_VPR: + case ARM_VFP_P0: + if (!dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + break; + default: + return FPSysRegCheckFailed; + } + + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * So we don't call vfp_access_check() and the callers must handle this. + */ + if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { + return FPSysRegCheckDone; + } + return FPSysRegCheckContinue; +} + +static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, + TCGLabel *label) +{ + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * We don't have a TB flag that matches the fpInactive check, so we + * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. 
+ * + * Emit code that checks fpInactive and does a conditional + * branch to label based on it: + * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) + * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) + */ + assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); + + /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ + TCGv_i32 aspen, fpca; + aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); + fpca = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); + tcg_gen_or_i32(fpca, fpca, aspen); + tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); + tcg_temp_free_i32(aspen); + tcg_temp_free_i32(fpca); +} + +static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, + fp_sysreg_loadfn *loadfn, + void *opaque) +{ + /* Do a write to an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = loadfn(s, opaque); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPSCR_NZCVQC: + { + TCGv_i32 fpscr; + tmp = loadfn(s, opaque); + if (dc_isar_feature(aa32_mve, s)) { + /* QC is only present for MVE; otherwise RES0 */ + TCGv_i32 qc = tcg_temp_new_i32(); + tcg_gen_andi_i32(qc, tmp, FPCR_QC); + /* + * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; + * here writing the same value into all elements is simplest. + */ + tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), + 16, 16, qc); + } + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_or_i32(fpscr, fpscr, tmp); + store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); + tcg_temp_free_i32(tmp); + break; + } + case ARM_VFP_FPCXT_NS: + lab_end = gen_new_label(); + /* fpInactive case: write is a NOP, so branch to end */ + gen_branch_fpInactive(s, TCG_COND_NE, lab_end); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS writes + * behave the same as FPCXT_S writes. + */ + if (s->fp_excp_el) { + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + gen_preserve_fp_state(s); + /* fall through */ + case ARM_VFP_FPCXT_S: + { + TCGv_i32 sfpa, control; + /* + * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes + * bits [27:0] from value and zeroes bits [31:28]. 
+ */ + tmp = loadfn(s, opaque); + sfpa = tcg_temp_new_i32(); + tcg_gen_shri_i32(sfpa, tmp, 31); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_deposit_i32(control, control, sfpa, + R_V7M_CONTROL_SFPA_SHIFT, 1); + store_cpu_field(control, v7m.control[M_REG_S]); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(sfpa); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + break; + } + tmp = loadfn(s, opaque); + store_cpu_field(tmp, v7m.vpr); + break; + case ARM_VFP_P0: + { + TCGv_i32 vpr; + tmp = loadfn(s, opaque); + vpr = load_cpu_field(v7m.vpr); + tcg_gen_deposit_i32(vpr, vpr, tmp, + R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + store_cpu_field(vpr, v7m.vpr); + tcg_temp_free_i32(tmp); + break; + } + default: + g_assert_not_reached(); + } + if (lab_end) { + gen_set_label(lab_end); + } + return true; +} + +static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, + fp_sysreg_storefn *storefn, + void *opaque) +{ + /* Do a read from an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + bool lookup_tb = false; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { + /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ + regno = QEMU_VFP_FPSCR_NZCV; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + storefn(s, opaque, tmp); + break; + case ARM_VFP_FPSCR_NZCVQC: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + storefn(s, opaque, tmp); + break; + case QEMU_VFP_FPSCR_NZCV: + /* + * Read just NZCV; this is a special case to avoid the + * helper call for the "VMRS to CPSR.NZCV" insn. + */ + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + storefn(s, opaque, tmp); + break; + case ARM_VFP_FPCXT_S: + { + TCGv_i32 control, sfpa, fpscr; + /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(sfpa); + /* + * Store result before updating FPSCR etc, in case + * it is a memory write which causes an exception. + */ + storefn(s, opaque, tmp); + /* + * Now we must reset FPSCR from FPDSCR_NS, and clear + * CONTROL.SFPA; so we'll end the TB here. 
+ */ + tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); + store_cpu_field(control, v7m.control[M_REG_S]); + fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + lookup_tb = true; + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGv_i32 control, sfpa, fpscr, fpdscr, zero; + TCGLabel *lab_active = gen_new_label(); + + lookup_tb = true; + + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* fpInactive case: reads as FPDSCR_NS */ + TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); + storefn(s, opaque, tmp); + lab_end = gen_new_label(); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS + * reads the same as FPCXT_S. + */ + if (s->fp_excp_el) { + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + gen_preserve_fp_state(s); + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + fpscr = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(fpscr, cpu_env); + tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(control); + /* Store result before updating FPSCR, in case it faults */ + storefn(s, opaque, tmp); + /* If SFPA is zero then set FPSCR from FPDSCR_NS */ + fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(zero); + tcg_temp_free_i32(sfpa); + tcg_temp_free_i32(fpdscr); + tcg_temp_free_i32(fpscr); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + break; + } + tmp = load_cpu_field(v7m.vpr); + storefn(s, opaque, tmp); + break; + case ARM_VFP_P0: + tmp = load_cpu_field(v7m.vpr); + tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + storefn(s, opaque, tmp); + break; + default: + g_assert_not_reached(); + } + + if (lab_end) { + gen_set_label(lab_end); + } + if (lookup_tb) { + gen_lookup_tb(s); + } + return true; +} + +static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value) +{ + arg_VMSR_VMRS *a = opaque; + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR */ + gen_set_nzcv(value); + tcg_temp_free_i32(value); + } else { + store_reg(s, a->rt, value); + } +} + +static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque) +{ + arg_VMSR_VMRS *a = opaque; + + return load_reg(s, a->rt); +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + /* + * Accesses to R15 are UNPREDICTABLE; we choose to undef. + * FPSCR -> r15 is a special case which writes to the PSR flags; + * set a->reg to a special value to tell gen_M_fp_sysreg_read() + * we only care about the top 4 bits of FPSCR there. 
+ */ + if (a->rt == 15) { + if (a->l && a->reg == ARM_VFP_FPSCR) { + a->reg = QEMU_VFP_FPSCR_NZCV; + } else { + return false; + } + } + + if (a->l) { + /* VMRS, move FP system register to gp register */ + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); + } else { + /* VMSR, move gp register to FP system register */ + return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); + } +} + +static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + + if (!a->a) { + offset = -offset; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + gen_aa32_st_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + tcg_temp_free_i32(value); + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + TCGv_i32 value = tcg_temp_new_i32(); + + if (!a->a) { + offset = -offset; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + gen_aa32_ld_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + return value; +} + +static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); +} + +static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); +} + static bool trans_NOCP(DisasContext *s, arg_nocp *a) { /* diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 107d6143be..8987ef2e5b 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -109,7 +109,7 @@ static inline long vfp_f16_offset(unsigned reg, bool top) * Generate code for M-profile lazy FP state preservation if needed; * this corresponds to the pseudocode PreserveFPState() function. */ -static void gen_preserve_fp_state(DisasContext *s) +void gen_preserve_fp_state(DisasContext *s) { if (s->v7m_lspact) { /* @@ -663,435 +663,14 @@ static bool trans_VDUP(DisasContext *s, arg_VDUP *a) return true; } -/* - * M-profile provides two different sets of instructions that can - * access floating point system registers: VMSR/VMRS (which move - * to/from a general purpose register) and VLDR/VSTR sysreg (which - * move directly to/from memory). In some cases there are also side - * effects which must happen after any write to memory (which could - * cause an exception). 
So we implement the common logic for the - * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), - * which take pointers to callback functions which will perform the - * actual "read/write general purpose register" and "read/write - * memory" operations. - */ - -/* - * Emit code to store the sysreg to its final destination; frees the - * TCG temp 'value' it is passed. - */ -typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value); -/* - * Emit code to load the value to be copied to the sysreg; returns - * a new TCG temporary - */ -typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque); - -/* Common decode/access checks for fp sysreg read/write */ -typedef enum FPSysRegCheckResult { - FPSysRegCheckFailed, /* caller should return false */ - FPSysRegCheckDone, /* caller should return true */ - FPSysRegCheckContinue, /* caller should continue generating code */ -} FPSysRegCheckResult; - -static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) -{ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - - switch (regno) { - case ARM_VFP_FPSCR: - case QEMU_VFP_FPSCR_NZCV: - break; - case ARM_VFP_FPSCR_NZCVQC: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_FPCXT_S: - case ARM_VFP_FPCXT_NS: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - if (!s->v8m_secure) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_VPR: - case ARM_VFP_P0: - if (!dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - break; - default: - return FPSysRegCheckFailed; - } - - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * So we don't call vfp_access_check() and the callers must handle this. - */ - if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { - return FPSysRegCheckDone; - } - return FPSysRegCheckContinue; -} - -static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, - TCGLabel *label) -{ - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * We don't have a TB flag that matches the fpInactive check, so we - * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. 
- * - * Emit code that checks fpInactive and does a conditional - * branch to label based on it: - * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) - * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) - */ - assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); - - /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ - TCGv_i32 aspen, fpca; - aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); - fpca = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); - tcg_gen_or_i32(fpca, fpca, aspen); - tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); - tcg_temp_free_i32(aspen); - tcg_temp_free_i32(fpca); -} - -static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, - fp_sysreg_loadfn *loadfn, - void *opaque) -{ - /* Do a write to an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = loadfn(s, opaque); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - gen_lookup_tb(s); - break; - case ARM_VFP_FPSCR_NZCVQC: - { - TCGv_i32 fpscr; - tmp = loadfn(s, opaque); - if (dc_isar_feature(aa32_mve, s)) { - /* QC is only present for MVE; otherwise RES0 */ - TCGv_i32 qc = tcg_temp_new_i32(); - tcg_gen_andi_i32(qc, tmp, FPCR_QC); - /* - * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; - * here writing the same value into all elements is simplest. - */ - tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), - 16, 16, qc); - } - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); - tcg_gen_or_i32(fpscr, fpscr, tmp); - store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); - tcg_temp_free_i32(tmp); - break; - } - case ARM_VFP_FPCXT_NS: - lab_end = gen_new_label(); - /* fpInactive case: write is a NOP, so branch to end */ - gen_branch_fpInactive(s, TCG_COND_NE, lab_end); - /* - * !fpInactive: if FPU disabled, take NOCP exception; - * otherwise PreserveFPState(), and then FPCXT_NS writes - * behave the same as FPCXT_S writes. - */ - if (s->fp_excp_el) { - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - /* - * This was only a conditional exception, so override - * gen_exception_insn()'s default to DISAS_NORETURN - */ - s->base.is_jmp = DISAS_NEXT; - break; - } - gen_preserve_fp_state(s); - /* fall through */ - case ARM_VFP_FPCXT_S: - { - TCGv_i32 sfpa, control; - /* - * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes - * bits [27:0] from value and zeroes bits [31:28]. 
- */ - tmp = loadfn(s, opaque); - sfpa = tcg_temp_new_i32(); - tcg_gen_shri_i32(sfpa, tmp, 31); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_deposit_i32(control, control, sfpa, - R_V7M_CONTROL_SFPA_SHIFT, 1); - store_cpu_field(control, v7m.control[M_REG_S]); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(sfpa); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - break; - } - tmp = loadfn(s, opaque); - store_cpu_field(tmp, v7m.vpr); - break; - case ARM_VFP_P0: - { - TCGv_i32 vpr; - tmp = loadfn(s, opaque); - vpr = load_cpu_field(v7m.vpr); - tcg_gen_deposit_i32(vpr, vpr, tmp, - R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - store_cpu_field(vpr, v7m.vpr); - tcg_temp_free_i32(tmp); - break; - } - default: - g_assert_not_reached(); - } - if (lab_end) { - gen_set_label(lab_end); - } - return true; -} - -static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, - fp_sysreg_storefn *storefn, - void *opaque) -{ - /* Do a read from an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - bool lookup_tb = false; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { - /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ - regno = QEMU_VFP_FPSCR_NZCV; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - storefn(s, opaque, tmp); - break; - case ARM_VFP_FPSCR_NZCVQC: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); - storefn(s, opaque, tmp); - break; - case QEMU_VFP_FPSCR_NZCV: - /* - * Read just NZCV; this is a special case to avoid the - * helper call for the "VMRS to CPSR.NZCV" insn. - */ - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - storefn(s, opaque, tmp); - break; - case ARM_VFP_FPCXT_S: - { - TCGv_i32 control, sfpa, fpscr; - /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(sfpa); - /* - * Store result before updating FPSCR etc, in case - * it is a memory write which causes an exception. - */ - storefn(s, opaque, tmp); - /* - * Now we must reset FPSCR from FPDSCR_NS, and clear - * CONTROL.SFPA; so we'll end the TB here. 
- */ - tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); - store_cpu_field(control, v7m.control[M_REG_S]); - fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - lookup_tb = true; - break; - } - case ARM_VFP_FPCXT_NS: - { - TCGv_i32 control, sfpa, fpscr, fpdscr, zero; - TCGLabel *lab_active = gen_new_label(); - - lookup_tb = true; - - gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); - /* fpInactive case: reads as FPDSCR_NS */ - TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); - storefn(s, opaque, tmp); - lab_end = gen_new_label(); - tcg_gen_br(lab_end); - - gen_set_label(lab_active); - /* - * !fpInactive: if FPU disabled, take NOCP exception; - * otherwise PreserveFPState(), and then FPCXT_NS - * reads the same as FPCXT_S. - */ - if (s->fp_excp_el) { - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - /* - * This was only a conditional exception, so override - * gen_exception_insn()'s default to DISAS_NORETURN - */ - s->base.is_jmp = DISAS_NEXT; - break; - } - gen_preserve_fp_state(s); - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - fpscr = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(fpscr, cpu_env); - tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(control); - /* Store result before updating FPSCR, in case it faults */ - storefn(s, opaque, tmp); - /* If SFPA is zero then set FPSCR from FPDSCR_NS */ - fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - zero = tcg_const_i32(0); - tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(zero); - tcg_temp_free_i32(sfpa); - tcg_temp_free_i32(fpdscr); - tcg_temp_free_i32(fpscr); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - break; - } - tmp = load_cpu_field(v7m.vpr); - storefn(s, opaque, tmp); - break; - case ARM_VFP_P0: - tmp = load_cpu_field(v7m.vpr); - tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - storefn(s, opaque, tmp); - break; - default: - g_assert_not_reached(); - } - - if (lab_end) { - gen_set_label(lab_end); - } - if (lookup_tb) { - gen_lookup_tb(s); - } - return true; -} - -static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value) -{ - arg_VMSR_VMRS *a = opaque; - - if (a->rt == 15) { - /* Set the 4 flag bits in the CPSR */ - gen_set_nzcv(value); - tcg_temp_free_i32(value); - } else { - store_reg(s, a->rt, value); - } -} - -static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque) -{ - arg_VMSR_VMRS *a = opaque; - - return load_reg(s, a->rt); -} - -static bool gen_M_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) -{ - /* - * Accesses to R15 are UNPREDICTABLE; we choose to undef. - * FPSCR -> r15 is a special case which writes to the PSR flags; - * set a->reg to a special value to tell gen_M_fp_sysreg_read() - * we only care about the top 4 bits of FPSCR there. 
- */ - if (a->rt == 15) { - if (a->l && a->reg == ARM_VFP_FPSCR) { - a->reg = QEMU_VFP_FPSCR_NZCV; - } else { - return false; - } - } - - if (a->l) { - /* VMRS, move FP system register to gp register */ - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); - } else { - /* VMSR, move gp register to FP system register */ - return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); - } -} - static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) { TCGv_i32 tmp; bool ignore_vfp_enabled = false; if (arm_dc_feature(s, ARM_FEATURE_M)) { - return gen_M_VMSR_VMRS(s, a); + /* M profile version was already handled in m-nocp.decode */ + return false; } if (!dc_isar_feature(aa32_fpsp_v2, s)) { @@ -1227,96 +806,6 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) return true; } -static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - - if (!a->a) { - offset = -offset; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - gen_aa32_st_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - tcg_temp_free_i32(value); - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } -} - -static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - TCGv_i32 value = tcg_temp_new_i32(); - - if (!a->a) { - offset = -offset; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - gen_aa32_ld_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - return value; -} - -static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); -} - -static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); -} static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a) { diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode index 52535d9b0b..5405e80197 100644 --- a/target/arm/vfp.decode +++ b/target/arm/vfp.decode @@ -84,20 +84,6 @@ VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp -# M-profile VLDR/VSTR to sysreg -%vldr_sysreg 22:1 13:3 -%imm7_0x4 0:7 !function=times_4 - -&vldr_sysreg rn reg imm a w p -@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \ - reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg - -# P=0 W=0 is SEE "Related encodings", so split into two patterns -VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 -VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... 
@vldr_sysreg p=0 w=1
-VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1
-VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1
-
 # We split the load/store multiple up into two patterns to avoid
 # overlap with other insns in the "Advanced SIMD load/store and 64-bit move"
 # grouping:

From e494cd0a1abce13a9a934c1cfdaaeabd389a77c7 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Fri, 18 Jun 2021 15:10:16 +0100
Subject: [PATCH 08/57] target/arm: Handle writeback in VLDR/VSTR sysreg with
 no memory access

A few subcases of VLDR/VSTR sysreg succeed but do not perform a
memory access:
 * VSTR of VPR when unprivileged
 * VLDR to VPR when unprivileged
 * VLDR to FPCXT_NS when fpInactive

In these cases, even though we don't do the memory access we should
still update the base register and perform the stack limit check if
the insn's addressing mode specifies writeback. Our implementation
failed to do this, because we handle these side-effects inside the
memory_to_fp_sysreg() and fp_sysreg_to_memory() callback functions,
which are only called if there's something to load or store.

Fix this by adding an extra argument to the callbacks which is set to
true to actually perform the access and false to only do side effects
like writeback, and calling the callback with do_access = false
for the three cases listed above.

This produces slightly suboptimal code for the case of a write to
FPCXT_NS when the FPU is inactive and the insn didn't have side
effects (ie no writeback, or via VMSR), in which case we'll generate
a conditional branch over an unconditional branch. But this doesn't
seem to be important enough to merit requiring the callback to
report back whether it generated any code or not.

Cc: qemu-stable@nongnu.org
Signed-off-by: Peter Maydell
Reviewed-by: Richard Henderson
Message-id: 20210618141019.10671-5-peter.maydell@linaro.org
---
 target/arm/translate-m-nocp.c | 102 ++++++++++++++++++++++++----------
 1 file changed, 72 insertions(+), 30 deletions(-)

diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
index 17fd2bf2fb..312a25f058 100644
--- a/target/arm/translate-m-nocp.c
+++ b/target/arm/translate-m-nocp.c
@@ -207,14 +207,20 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
 
 /*
  * Emit code to store the sysreg to its final destination; frees the
- * TCG temp 'value' it is passed.
+ * TCG temp 'value' it is passed. do_access is true to do the store,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
  */
-typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value);
+typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value,
+                               bool do_access);
 /*
  * Emit code to load the value to be copied to the sysreg; returns
- * a new TCG temporary
+ * a new TCG temporary. do_access is true to do the load,
+ * and false to skip it and only perform side-effects like base
+ * register writeback.
*/ -typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque); +typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque, + bool do_access); /* Common decode/access checks for fp sysreg read/write */ typedef enum FPSysRegCheckResult { @@ -318,7 +324,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, switch (regno) { case ARM_VFP_FPSCR: - tmp = loadfn(s, opaque); + tmp = loadfn(s, opaque, true); gen_helper_vfp_set_fpscr(cpu_env, tmp); tcg_temp_free_i32(tmp); gen_lookup_tb(s); @@ -326,7 +332,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, case ARM_VFP_FPSCR_NZCVQC: { TCGv_i32 fpscr; - tmp = loadfn(s, opaque); + tmp = loadfn(s, opaque, true); if (dc_isar_feature(aa32_mve, s)) { /* QC is only present for MVE; otherwise RES0 */ TCGv_i32 qc = tcg_temp_new_i32(); @@ -347,9 +353,19 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, break; } case ARM_VFP_FPCXT_NS: + { + TCGLabel *lab_active = gen_new_label(); + lab_end = gen_new_label(); - /* fpInactive case: write is a NOP, so branch to end */ - gen_branch_fpInactive(s, TCG_COND_NE, lab_end); + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* + * fpInactive case: write is a NOP, so only do side effects + * like register writeback before we branch to end + */ + loadfn(s, opaque, false); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); /* * !fpInactive: if FPU disabled, take NOCP exception; * otherwise PreserveFPState(), and then FPCXT_NS writes @@ -366,7 +382,8 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, break; } gen_preserve_fp_state(s); - /* fall through */ + } + /* fall through */ case ARM_VFP_FPCXT_S: { TCGv_i32 sfpa, control; @@ -374,7 +391,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes * bits [27:0] from value and zeroes bits [31:28]. */ - tmp = loadfn(s, opaque); + tmp = loadfn(s, opaque, true); sfpa = tcg_temp_new_i32(); tcg_gen_shri_i32(sfpa, tmp, 31); control = load_cpu_field(v7m.control[M_REG_S]); @@ -390,15 +407,16 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, case ARM_VFP_VPR: /* Behaves as NOP if not privileged */ if (IS_USER(s)) { + loadfn(s, opaque, false); break; } - tmp = loadfn(s, opaque); + tmp = loadfn(s, opaque, true); store_cpu_field(tmp, v7m.vpr); break; case ARM_VFP_P0: { TCGv_i32 vpr; - tmp = loadfn(s, opaque); + tmp = loadfn(s, opaque, true); vpr = load_cpu_field(v7m.vpr); tcg_gen_deposit_i32(vpr, vpr, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); @@ -442,13 +460,13 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, case ARM_VFP_FPSCR: tmp = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(tmp, cpu_env); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); break; case ARM_VFP_FPSCR_NZCVQC: tmp = tcg_temp_new_i32(); gen_helper_vfp_get_fpscr(tmp, cpu_env); tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); break; case QEMU_VFP_FPSCR_NZCV: /* @@ -457,7 +475,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, */ tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); break; case ARM_VFP_FPCXT_S: { @@ -476,7 +494,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, * Store result before updating FPSCR etc, in case * it is a memory write which causes an exception. 
*/ - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); /* * Now we must reset FPSCR from FPDSCR_NS, and clear * CONTROL.SFPA; so we'll end the TB here. @@ -499,7 +517,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); /* fpInactive case: reads as FPDSCR_NS */ TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); lab_end = gen_new_label(); tcg_gen_br(lab_end); @@ -531,7 +549,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, tcg_gen_or_i32(tmp, tmp, sfpa); tcg_temp_free_i32(control); /* Store result before updating FPSCR, in case it faults */ - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); /* If SFPA is zero then set FPSCR from FPDSCR_NS */ fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); zero = tcg_const_i32(0); @@ -546,15 +564,16 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, case ARM_VFP_VPR: /* Behaves as NOP if not privileged */ if (IS_USER(s)) { + storefn(s, opaque, NULL, false); break; } tmp = load_cpu_field(v7m.vpr); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); break; case ARM_VFP_P0: tmp = load_cpu_field(v7m.vpr); tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - storefn(s, opaque, tmp); + storefn(s, opaque, tmp, true); break; default: g_assert_not_reached(); @@ -569,10 +588,15 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, return true; } -static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value) +static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) { arg_VMSR_VMRS *a = opaque; + if (!do_access) { + return; + } + if (a->rt == 15) { /* Set the 4 flag bits in the CPSR */ gen_set_nzcv(value); @@ -582,10 +606,13 @@ static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value) } } -static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque) +static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access) { arg_VMSR_VMRS *a = opaque; + if (!do_access) { + return NULL; + } return load_reg(s, a->rt); } @@ -614,7 +641,8 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) } } -static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) +static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) { arg_vldr_sysreg *a = opaque; uint32_t offset = a->imm; @@ -624,6 +652,10 @@ static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) offset = -offset; } + if (!do_access && !a->w) { + return; + } + addr = load_reg(s, a->rn); if (a->p) { tcg_gen_addi_i32(addr, addr, offset); @@ -633,9 +665,11 @@ static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) gen_helper_v8m_stackcheck(cpu_env, addr); } - gen_aa32_st_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - tcg_temp_free_i32(value); + if (do_access) { + gen_aa32_st_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + tcg_temp_free_i32(value); + } if (a->w) { /* writeback */ @@ -648,17 +682,22 @@ static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) } } -static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque) +static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque, + bool do_access) { arg_vldr_sysreg *a = opaque; uint32_t offset = a->imm; TCGv_i32 addr; - TCGv_i32 value = tcg_temp_new_i32(); + TCGv_i32 value = NULL; if (!a->a) { 
offset = -offset;
     }
 
+    if (!do_access && !a->w) {
+        return NULL;
+    }
+
     addr = load_reg(s, a->rn);
     if (a->p) {
         tcg_gen_addi_i32(addr, addr, offset);
@@ -668,8 +707,11 @@ static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque)
         gen_helper_v8m_stackcheck(cpu_env, addr);
     }
 
-    gen_aa32_ld_i32(s, value, addr, get_mem_index(s),
-                    MO_UL | MO_ALIGN | s->be_data);
+    if (do_access) {
+        value = tcg_temp_new_i32();
+        gen_aa32_ld_i32(s, value, addr, get_mem_index(s),
+                        MO_UL | MO_ALIGN | s->be_data);
+    }
 
     if (a->w) {
         /* writeback */

From 95aceeeac9a37d05c106ba807867616496d5c90e Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Fri, 18 Jun 2021 15:10:17 +0100
Subject: [PATCH 09/57] target/arm: Factor FP context update code out into helper function

Factor the code in full_vfp_access_check() which updates the
ownership of the FP context and creates a new FP context out into
its own function.

Signed-off-by: Peter Maydell
Reviewed-by: Richard Henderson
Message-id: 20210618141019.10671-6-peter.maydell@linaro.org
---
 target/arm/translate-vfp.c | 104 +++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index 8987ef2e5b..85418dee2e 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -131,6 +131,62 @@ void gen_preserve_fp_state(DisasContext *s)
     }
 }
 
+/*
+ * Generate code for M-profile FP context handling: update the
+ * ownership of the FP context, and create a new context if
+ * necessary. This corresponds to the parts of the pseudocode
+ * ExecuteFPCheck() after the initial PreserveFPState() call.
+ */
+static void gen_update_fp_context(DisasContext *s)
+{
+    /* Update ownership of FP context: set FPCCR.S to match current state */
+    if (s->v8m_fpccr_s_wrong) {
+        TCGv_i32 tmp;
+
+        tmp = load_cpu_field(v7m.fpccr[M_REG_S]);
+        if (s->v8m_secure) {
+            tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK);
+        } else {
+            tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK);
+        }
+        store_cpu_field(tmp, v7m.fpccr[M_REG_S]);
+        /* Don't need to do this for any further FP insns in this TB */
+        s->v8m_fpccr_s_wrong = false;
+    }
+
+    if (s->v7m_new_fp_ctxt_needed) {
+        /*
+         * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA,
+         * the FPSCR, and VPR.
+         */
+        TCGv_i32 control, fpscr;
+        uint32_t bits = R_V7M_CONTROL_FPCA_MASK;
+
+        fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]);
+        gen_helper_vfp_set_fpscr(cpu_env, fpscr);
+        tcg_temp_free_i32(fpscr);
+        if (dc_isar_feature(aa32_mve, s)) {
+            TCGv_i32 z32 = tcg_const_i32(0);
+            store_cpu_field(z32, v7m.vpr);
+        }
+
+        /*
+         * We don't need to arrange to end the TB, because the only
+         * parts of FPSCR which we cache in the TB flags are the VECLEN
+         * and VECSTRIDE, and those don't exist for M-profile.
+         */
+
+        if (s->v8m_secure) {
+            bits |= R_V7M_CONTROL_SFPA_MASK;
+        }
+        control = load_cpu_field(v7m.control[M_REG_S]);
+        tcg_gen_ori_i32(control, control, bits);
+        store_cpu_field(control, v7m.control[M_REG_S]);
+        /* Don't need to do this for any further FP insns in this TB */
+        s->v7m_new_fp_ctxt_needed = false;
+    }
+}
+
 /*
  * Check that VFP access is enabled. If it is, do the necessary
  * M-profile lazy-FP handling and then return true.
@@ -173,52 +229,8 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) /* Trigger lazy-state preservation if necessary */ gen_preserve_fp_state(s); - /* Update ownership of FP context: set FPCCR.S to match current state */ - if (s->v8m_fpccr_s_wrong) { - TCGv_i32 tmp; - - tmp = load_cpu_field(v7m.fpccr[M_REG_S]); - if (s->v8m_secure) { - tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); - } else { - tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); - } - store_cpu_field(tmp, v7m.fpccr[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v8m_fpccr_s_wrong = false; - } - - if (s->v7m_new_fp_ctxt_needed) { - /* - * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, - * the FPSCR, and VPR. - */ - TCGv_i32 control, fpscr; - uint32_t bits = R_V7M_CONTROL_FPCA_MASK; - - fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - if (dc_isar_feature(aa32_mve, s)) { - TCGv_i32 z32 = tcg_const_i32(0); - store_cpu_field(z32, v7m.vpr); - } - - /* - * We don't need to arrange to end the TB, because the only - * parts of FPSCR which we cache in the TB flags are the VECLEN - * and VECSTRIDE, and those don't exist for M-profile. - */ - - if (s->v8m_secure) { - bits |= R_V7M_CONTROL_SFPA_MASK; - } - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_ori_i32(control, control, bits); - store_cpu_field(control, v7m.control[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v7m_new_fp_ctxt_needed = false; - } + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); } return true; From e8cedaf779c0c2b13e0cc1ca580beaf5a6562a73 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 18 Jun 2021 15:10:18 +0100 Subject: [PATCH 10/57] target/arm: Split vfp_access_check() into A and M versions vfp_access_check and its helper routine full_vfp_access_check() has gradually grown and is now an awkward mix of A-profile only and M-profile only pieces. Refactor it into an A-profile only and an M-profile only version, taking advantage of the fact that now the only direct call to full_vfp_access_check() is in A-profile-only code. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210618141019.10671-7-peter.maydell@linaro.org --- target/arm/translate-vfp.c | 79 +++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 85418dee2e..d89c7834fa 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -188,32 +188,19 @@ static void gen_update_fp_context(DisasContext *s) } /* - * Check that VFP access is enabled. If it is, do the necessary - * M-profile lazy-FP handling and then return true. - * If not, emit code to generate an appropriate exception and - * return false. + * Check that VFP access is enabled, A-profile specific version. + * + * If VFP is enabled, return true. If not, emit code to generate an + * appropriate exception and return false. * The ignore_vfp_enabled argument specifies that we should ignore - * whether VFP is enabled via FPEXC[EN]: this should be true for FMXR/FMRX + * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns. 
*/ -static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) +static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) { if (s->fp_excp_el) { - if (arm_dc_feature(s, ARM_FEATURE_M)) { - /* - * M-profile mostly catches the "FPU disabled" case early, in - * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) - * which do coprocessor-checks are outside the large ranges of - * the encoding space handled by the patterns in m-nocp.decode, - * and for them we may need to raise NOCP here. - */ - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - } else { - gen_exception_insn(s, s->pc_curr, EXCP_UDEF, - syn_fp_access_trap(1, 0xe, false), - s->fp_excp_el); - } + gen_exception_insn(s, s->pc_curr, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false), s->fp_excp_el); return false; } @@ -222,17 +209,39 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) unallocated_encoding(s); return false; } + return true; +} - if (arm_dc_feature(s, ARM_FEATURE_M)) { - /* Handle M-profile lazy FP state mechanics */ - - /* Trigger lazy-state preservation if necessary */ - gen_preserve_fp_state(s); - - /* Update ownership of FP context and create new FP context if needed */ - gen_update_fp_context(s); +/* + * Check that VFP access is enabled, M-profile specific version. + * + * If VFP is enabled, do the necessary M-profile lazy-FP handling and then + * return true. If not, emit code to generate an appropriate exception and + * return false. + */ +static bool vfp_access_check_m(DisasContext *s) +{ + if (s->fp_excp_el) { + /* + * M-profile mostly catches the "FPU disabled" case early, in + * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) + * which do coprocessor-checks are outside the large ranges of + * the encoding space handled by the patterns in m-nocp.decode, + * and for them we may need to raise NOCP here. + */ + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return false; } + /* Handle M-profile lazy FP state mechanics */ + + /* Trigger lazy-state preservation if necessary */ + gen_preserve_fp_state(s); + + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); + return true; } @@ -242,7 +251,11 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) */ bool vfp_access_check(DisasContext *s) { - return full_vfp_access_check(s, false); + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return vfp_access_check_m(s); + } else { + return vfp_access_check_a(s, false); + } } static bool trans_VSEL(DisasContext *s, arg_VSEL *a) @@ -732,7 +745,11 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) return false; } - if (!full_vfp_access_check(s, ignore_vfp_enabled)) { + /* + * Call vfp_access_check_a() directly, because we need to tell + * it to ignore FPEXC.EN for some register accesses. + */ + if (!vfp_access_check_a(s, ignore_vfp_enabled)) { return true; } From 88137f787f374ac4117877bcc8c8af97326a10bd Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 18 Jun 2021 15:10:19 +0100 Subject: [PATCH 11/57] target/arm: Handle FPU check for FPCXT_NS insns via vfp_access_check_m() Instead of open-coding the "take NOCP exception if FPU disabled, otherwise call gen_preserve_fp_state()" code in the accessors for FPCXT_NS, add an argument to vfp_access_check_m() which tells it to skip the gen_update_fp_context() call, so we can use it for the FPCXT_NS case. 
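
The shape of the change in the two FPCXT_NS accessors is, roughly (a
sketch of the before/after control flow, not the exact hunks):

    /* before: open-coded check plus explicit lazy-state preservation */
    if (s->fp_excp_el) {
        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
                           syn_uncategorized(), s->fp_excp_el);
        s->base.is_jmp = DISAS_NEXT; /* only a conditional exception */
        break;
    }
    gen_preserve_fp_state(s);

    /* after: one call which skips only the context-update step */
    if (!vfp_access_check_m(s, true)) {
        s->base.is_jmp = DISAS_NEXT; /* only a conditional exception */
        break;
    }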
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210618141019.10671-8-peter.maydell@linaro.org --- target/arm/translate-a32.h | 2 +- target/arm/translate-m-nocp.c | 10 ++-------- target/arm/translate-vfp.c | 13 ++++++++----- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index abb3ecb6bc..2326405300 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -32,7 +32,7 @@ bool disas_neon_shared(DisasContext *s, uint32_t insn); void load_reg_var(DisasContext *s, TCGv_i32 var, int reg); void arm_gen_condlabel(DisasContext *s); bool vfp_access_check(DisasContext *s); -void gen_preserve_fp_state(DisasContext *s); +bool vfp_access_check_m(DisasContext *s, bool skip_context_update); void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop); void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop); void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop); diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c index 312a25f058..5eab04832c 100644 --- a/target/arm/translate-m-nocp.c +++ b/target/arm/translate-m-nocp.c @@ -371,9 +371,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, * otherwise PreserveFPState(), and then FPCXT_NS writes * behave the same as FPCXT_S writes. */ - if (s->fp_excp_el) { - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); + if (!vfp_access_check_m(s, true)) { /* * This was only a conditional exception, so override * gen_exception_insn()'s default to DISAS_NORETURN @@ -381,7 +379,6 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, s->base.is_jmp = DISAS_NEXT; break; } - gen_preserve_fp_state(s); } /* fall through */ case ARM_VFP_FPCXT_S: @@ -527,9 +524,7 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, * otherwise PreserveFPState(), and then FPCXT_NS * reads the same as FPCXT_S. */ - if (s->fp_excp_el) { - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); + if (!vfp_access_check_m(s, true)) { /* * This was only a conditional exception, so override * gen_exception_insn()'s default to DISAS_NORETURN @@ -537,7 +532,6 @@ static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, s->base.is_jmp = DISAS_NEXT; break; } - gen_preserve_fp_state(s); tmp = tcg_temp_new_i32(); sfpa = tcg_temp_new_i32(); fpscr = tcg_temp_new_i32(); diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index d89c7834fa..86e43c02dc 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -109,7 +109,7 @@ static inline long vfp_f16_offset(unsigned reg, bool top) * Generate code for M-profile lazy FP state preservation if needed; * this corresponds to the pseudocode PreserveFPState() function. */ -void gen_preserve_fp_state(DisasContext *s) +static void gen_preserve_fp_state(DisasContext *s) { if (s->v7m_lspact) { /* @@ -218,8 +218,9 @@ static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) * If VFP is enabled, do the necessary M-profile lazy-FP handling and then * return true. If not, emit code to generate an appropriate exception and * return false. + * skip_context_update is true to skip the "update FP context" part of this. 
*/ -static bool vfp_access_check_m(DisasContext *s) +bool vfp_access_check_m(DisasContext *s, bool skip_context_update) { if (s->fp_excp_el) { /* @@ -239,8 +240,10 @@ static bool vfp_access_check_m(DisasContext *s) /* Trigger lazy-state preservation if necessary */ gen_preserve_fp_state(s); - /* Update ownership of FP context and create new FP context if needed */ - gen_update_fp_context(s); + if (!skip_context_update) { + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); + } return true; } @@ -252,7 +255,7 @@ static bool vfp_access_check_m(DisasContext *s) bool vfp_access_check(DisasContext *s) { if (arm_dc_feature(s, ARM_FEATURE_M)) { - return vfp_access_check_m(s); + return vfp_access_check_m(s, false); } else { return vfp_access_check_a(s, false); } From 507b6a500c2f0f6cf6182aa69efac4c20eb3e97b Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:45 +0100 Subject: [PATCH 12/57] target/arm: Implement MVE VLDR/VSTR (non-widening forms) Implement the forms of the MVE VLDR and VSTR insns which perform non-widening loads of bytes, halfwords or words from memory into vector elements of the same width (encodings T5, T6, T7). (At the moment we know for MVE and M-profile in general that vfp_access_check() can never return false, but we include the conventional return-true-on-failure check for consistency with non-M-profile translation code.) Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-2-peter.maydell@linaro.org --- target/arm/helper-mve.h | 24 ++++++ target/arm/helper.h | 2 + target/arm/internals.h | 11 +++ target/arm/meson.build | 1 + target/arm/mve.decode | 22 +++++ target/arm/mve_helper.c | 172 +++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 119 +++++++++++++++++++++++++ 7 files changed, 351 insertions(+) create mode 100644 target/arm/helper-mve.h create mode 100644 target/arm/mve_helper.c diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h new file mode 100644 index 0000000000..9e3b0b09af --- /dev/null +++ b/target/arm/helper-mve.h @@ -0,0 +1,24 @@ +/* + * M-profile MVE specific helper definitions + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ +DEF_HELPER_FLAGS_3(mve_vldrb, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrw, TCG_CALL_NO_WG, void, env, ptr, i32) diff --git a/target/arm/helper.h b/target/arm/helper.h index dc6eb96d43..db87d7d537 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -1019,3 +1019,5 @@ DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG, #include "helper-a64.h" #include "helper-sve.h" #endif + +#include "helper-mve.h" diff --git a/target/arm/internals.h b/target/arm/internals.h index 886db56b58..3ba86e8af8 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1202,4 +1202,15 @@ static inline uint64_t useronly_maybe_clean_ptr(uint32_t desc, uint64_t ptr) return ptr; } +/* Values for M-profile PSR.ECI for MVE insns */ +enum MVEECIState { + ECI_NONE = 0, /* No completed beats */ + ECI_A0 = 1, /* Completed: A0 */ + ECI_A0A1 = 2, /* Completed: A0, A1 */ + /* 3 is reserved */ + ECI_A0A1A2 = 4, /* Completed: A0, A1, A2 */ + ECI_A0A1A2B0 = 5, /* Completed: A0, A1, A2, B0 */ + /* All other values reserved */ +}; + #endif diff --git a/target/arm/meson.build b/target/arm/meson.build index 2b50be3f86..25a02bf276 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -23,6 +23,7 @@ arm_ss.add(files( 'helper.c', 'iwmmxt_helper.c', 'm_helper.c', + 'mve_helper.c', 'neon_helper.c', 'op_helper.c', 'tlb_helper.c', diff --git a/target/arm/mve.decode b/target/arm/mve.decode index c8492bb576..858a161fd7 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -18,3 +18,25 @@ # # This file is processed by scripts/decodetree.py # + +%qd 22:1 13:3 + +&vldr_vstr rn qd imm p a w size l + +@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd + +# Vector loads and stores + +# Non-widening loads/stores (P=0 W=0 is 'related encoding') +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=0 w=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=1 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c new file mode 100644 index 0000000000..60c61268c7 --- /dev/null +++ b/target/arm/mve_helper.c @@ -0,0 +1,172 @@ +/* + * M-profile MVE Operations + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "vec_internal.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
+
+static uint16_t mve_element_mask(CPUARMState *env)
+{
+    /*
+     * Return the mask of which elements in the MVE vector should be
+     * updated. This is a combination of multiple things:
+     *  (1) by default, we update every lane in the vector
+     *  (2) VPT predication stores its state in the VPR register;
+     *  (3) low-overhead-branch tail predication will mask out part
+     *      of the vector on the final iteration of the loop
+     *  (4) if EPSR.ECI is set then we must execute only some beats
+     *      of the insn
+     * We combine all these into a 16-bit result with the same semantics
+     * as VPR.P0: 0 to mask the lane, 1 if it is active.
+     * 8-bit vector ops will look at all bits of the result;
+     * 16-bit ops will look at bits 0, 2, 4, ...;
+     * 32-bit ops will look at bits 0, 4, 8 and 12.
+     * Compare pseudocode GetCurInstrBeat(), though that only returns
+     * the 4-bit slice of the mask corresponding to a single beat.
+     */
+    uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
+
+    if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
+        mask |= 0xff;
+    }
+    if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
+        mask |= 0xff00;
+    }
+
+    if (env->v7m.ltpsize < 4 &&
+        env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
+        /*
+         * Tail predication active, and this is the last loop iteration.
+         * The element size is (1 << ltpsize), and we only want to process
+         * loopcount elements, so we want to retain the least significant
+         * (loopcount * esize) predicate bits and zero out bits above that.
+         */
+        int masklen = env->regs[14] << env->v7m.ltpsize;
+        assert(masklen <= 16);
+        mask &= MAKE_64BIT_MASK(0, masklen);
+    }
+
+    if ((env->condexec_bits & 0xf) == 0) {
+        /*
+         * ECI bits indicate which beats are already executed;
+         * we handle this by effectively predicating them out.
+         */
+        int eci = env->condexec_bits >> 4;
+        switch (eci) {
+        case ECI_NONE:
+            break;
+        case ECI_A0:
+            mask &= 0xfff0;
+            break;
+        case ECI_A0A1:
+            mask &= 0xff00;
+            break;
+        case ECI_A0A1A2:
+        case ECI_A0A1A2B0:
+            mask &= 0xf000;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
+
+    return mask;
+}
+
+static void mve_advance_vpt(CPUARMState *env)
+{
+    /* Advance the VPT and ECI state if necessary */
+    uint32_t vpr = env->v7m.vpr;
+    unsigned mask01, mask23;
+
+    if ((env->condexec_bits & 0xf) == 0) {
+        env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
+            (ECI_A0 << 4) : (ECI_NONE << 4);
+    }
+
+    if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
+        /* VPT not enabled, nothing to do */
+        return;
+    }
+
+    mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
+    mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
+    if (mask01 > 8) {
+        /* high bit set, but not 0b1000: invert the relevant half of P0 */
+        vpr ^= 0xff;
+    }
+    if (mask23 > 8) {
+        /* high bit set, but not 0b1000: invert the relevant half of P0 */
+        vpr ^= 0xff00;
+    }
+    vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
+    vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
+    env->v7m.vpr = vpr;
+}
+
+
+#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE)                         \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr)    \
+    {                                                                   \
+        TYPE *d = vd;                                                   \
+        uint16_t mask = mve_element_mask(env);                          \
+        unsigned b, e;                                                  \
+        /*                                                              \
+         * R_SXTM allows the dest reg to become UNKNOWN for abandoned   \
+         * beats so we don't care if we update part of the dest and     \
+         * then take an exception.
\ + */ \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned b, e; \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +DO_VLDR(vldrb, 1, ldub, 1, uint8_t) +DO_VLDR(vldrh, 2, lduw, 2, uint16_t) +DO_VLDR(vldrw, 4, ldl, 4, uint32_t) + +DO_VSTR(vstrb, 1, stb, 1, uint8_t) +DO_VSTR(vstrh, 2, stw, 2, uint16_t) +DO_VSTR(vstrw, 4, stl, 4, uint32_t) + +#undef DO_VLDR +#undef DO_VSTR diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index e91f526a1a..f98bd6d038 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -27,3 +27,122 @@ /* Include the generated decoder */ #include "decode-mve.c.inc" + +typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); + +/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ +static inline long mve_qreg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg].d[0]); +} + +static TCGv_ptr mve_qreg_ptr(unsigned reg) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, mve_qreg_offset(reg)); + return ret; +} + +static bool mve_check_qreg_bank(DisasContext *s, int qmask) +{ + /* + * Check whether Qregs are in range. For v8.1M only Q0..Q7 + * are supported, see VFPSmallRegisterBank(). + */ + return qmask < 8; +} + +static bool mve_eci_check(DisasContext *s) +{ + /* + * This is a beatwise insn: check that ECI is valid (not a + * reserved value) and note that we are handling it. + * Return true if OK, false if we generated an exception. + */ + s->eci_handled = true; + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + /* Reserved value: INVSTATE UsageFault */ + gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(), + default_exception_el(s)); + return false; + } +} + +static void mve_update_eci(DisasContext *s) +{ + /* + * The helper function will always update the CPUState field, + * so we only need to update the DisasContext field. + */ + if (s->eci) { + s->eci = (s->eci == ECI_A0A1A2B0) ? 
ECI_A0 : ECI_NONE; + } +} + +static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn) +{ + TCGv_i32 addr; + uint32_t offset; + TCGv_ptr qreg; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + if (a->rn == 15 || (a->rn == 13 && a->w)) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << a->size; + if (!a->a) { + offset = -offset; + } + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + qreg = mve_qreg_ptr(a->qd); + fn(cpu_env, qreg, addr); + tcg_temp_free_ptr(qreg); + + /* + * Writeback always happens after the last beat of the insn, + * regardless of predication + */ + if (a->w) { + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + mve_update_eci(s); + return true; +} + +static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) +{ + static MVEGenLdStFn * const ldstfns[4][2] = { + { gen_helper_mve_vstrb, gen_helper_mve_vldrb }, + { gen_helper_mve_vstrh, gen_helper_mve_vldrh }, + { gen_helper_mve_vstrw, gen_helper_mve_vldrw }, + { NULL, NULL } + }; + return do_ldst(s, a, ldstfns[a->size][a->l]); +} From 2fc6b7510c6859478264b7402ba01dbee86b7e46 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:46 +0100 Subject: [PATCH 13/57] target/arm: Implement widening/narrowing MVE VLDR/VSTR insns Implement the variants of MVE VLDR (encodings T1, T2) which perform "widening" loads where bytes or halfwords are loaded from memory and zero or sign-extended into halfword or word length vector elements, and the narrowing MVE VSTR (encodings T1, T2) where bytes or halfwords are stored from halfword or word elements. 
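
The widening semantics are plain C integer conversions; as a minimal
standalone sketch (a hypothetical demo program, not QEMU code) of a
VLDRB.S16-style load of bytes into signed halfword elements:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t mem[8] = { 0x01, 0x80, 0xff, 0x7f, 0x00, 0xc0, 0x10, 0xfe };
        int16_t q[8];

        for (int e = 0; e < 8; e++) {
            q[e] = (int8_t)mem[e]; /* sign-extend each byte to a halfword */
        }
        for (int e = 0; e < 8; e++) {
            printf("q[%d] = %d\n", e, q[e]);
        }
        return 0;
    }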
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-3-peter.maydell@linaro.org --- target/arm/helper-mve.h | 10 ++++++++++ target/arm/mve.decode | 25 +++++++++++++++++++++++-- target/arm/mve_helper.c | 11 +++++++++++ target/arm/translate-mve.c | 14 ++++++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 9e3b0b09af..e47d4164ae 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -22,3 +22,13 @@ DEF_HELPER_FLAGS_3(mve_vldrw, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrb, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrh, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrw, TCG_CALL_NO_WG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(mve_vldrb_sh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_sw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_uh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_uw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh_sw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh_uw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 858a161fd7..3bc5f03453 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -21,12 +21,33 @@ %qd 22:1 13:3 -&vldr_vstr rn qd imm p a w size l +&vldr_vstr rn qd imm p a w size l u -@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd +@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 +# Note that both Rn and Qd are 3 bits only (no D bit) +@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr # Vector loads and stores +# Widening loads and narrowing stores: +# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding' +# This means we need to expand out to multiple patterns for P, W, SZ. +# For stores the U bit must be 0 but we catch that in the trans_ function. +# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from +# signed halfword element in register", etc. +VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=0 w=1 size=1 +VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=1 size=1 +VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 +VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 + # Non-widening loads/stores (P=0 W=0 is 'related encoding') VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... 
@vldr_vstr \ size=0 p=0 w=1 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 60c61268c7..3c2b036c9c 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -168,5 +168,16 @@ DO_VSTR(vstrb, 1, stb, 1, uint8_t) DO_VSTR(vstrh, 2, stw, 2, uint16_t) DO_VSTR(vstrw, 4, stl, 4, uint32_t) +DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) +DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) +DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) +DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) +DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) +DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) + +DO_VSTR(vstrb_h, 1, stb, 2, int16_t) +DO_VSTR(vstrb_w, 1, stb, 4, int32_t) +DO_VSTR(vstrh_w, 2, stw, 4, int32_t) + #undef DO_VLDR #undef DO_VSTR diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index f98bd6d038..4cabdf7a69 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -146,3 +146,17 @@ static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) }; return do_ldst(s, a, ldstfns[a->size][a->l]); } + +#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST) \ + static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \ + { \ + static MVEGenLdStFn * const ldstfns[2][2] = { \ + { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \ + { NULL, gen_helper_mve_##ULD }, \ + }; \ + return do_ldst(s, a, ldstfns[a->u][a->l]); \ + } + +DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h) +DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w) +DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w) From 0f0f2bd54817ffad1ccb15dd0fb3adf2db1ec394 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:47 +0100 Subject: [PATCH 14/57] target/arm: Implement MVE VCLZ Implement the MVE VCLZ insn (and the necessary machinery for MVE 1-input vector ops). Note that for non-load instructions predication is always performed at a byte level granularity regardless of element size (R_ZLSJ), and so the masking logic here differs from that used in the VLDR and VSTR helpers. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-4-peter.maydell@linaro.org --- target/arm/helper-mve.h | 4 ++ target/arm/mve.decode | 8 ++++ target/arm/mve_helper.c | 82 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 38 ++++++++++++++++++ 4 files changed, 132 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index e47d4164ae..c5c1315b16 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -32,3 +32,7 @@ DEF_HELPER_FLAGS_3(mve_vldrh_uw, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 3bc5f03453..24999bf703 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -20,13 +20,17 @@ # %qd 22:1 13:3 +%qm 5:1 1:3 &vldr_vstr rn qd imm p a w size l u +&1op qd qm size @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 # Note that both Rn and Qd are 3 bits only (no D bit) @vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr +@1op .... .... .... size:2 .. .... .... .... .... 
&1op qd=%qd qm=%qm + # Vector loads and stores # Widening loads and narrowing stores: @@ -61,3 +65,7 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ size=1 p=1 VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ size=2 p=1 + +# Vector miscellaneous + +VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 3c2b036c9c..f2fae523e2 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -181,3 +181,85 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) #undef DO_VLDR #undef DO_VSTR + +/* + * The mergemask(D, R, M) macro performs the operation "*D = R" but + * storing only the bytes which correspond to 1 bits in M, + * leaving other bytes in *D unchanged. We use _Generic + * to select the correct implementation based on the type of D. + */ + +static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) +{ + if (mask & 1) { + *d = r; + } +} + +static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) +{ + mergemask_ub((uint8_t *)d, r, mask); +} + +static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) +{ + uint16_t bmask = expand_pred_b_data[mask & 3]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) +{ + mergemask_uh((uint16_t *)d, r, mask); +} + +static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) +{ + uint32_t bmask = expand_pred_b_data[mask & 0xf]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) +{ + mergemask_uw((uint32_t *)d, r, mask); +} + +static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) +{ + uint64_t bmask = expand_pred_b_data[mask & 0xff]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) +{ + mergemask_uq((uint64_t *)d, r, mask); +} + +#define mergemask(D, R, M) \ + _Generic(D, \ + uint8_t *: mergemask_ub, \ + int8_t *: mergemask_sb, \ + uint16_t *: mergemask_uh, \ + int16_t *: mergemask_sh, \ + uint32_t *: mergemask_uw, \ + int32_t *: mergemask_sw, \ + uint64_t *: mergemask_uq, \ + int64_t *: mergemask_sq)(D, R, M) + +#define DO_1OP(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) +DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) +DO_1OP(vclzw, 4, uint32_t, clz32) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 4cabdf7a69..9eb6a68c97 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -29,6 +29,7 @@ #include "decode-mve.c.inc" typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ static inline long mve_qreg_offset(unsigned reg) @@ -160,3 +161,40 @@ static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h) DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w) DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w) + +static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) +{ + TCGv_ptr qd, qm; + + if 
(!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_1OP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_1OP(VCLZ, vclz) From 6437f1f77c3bca329b6464e9357647f33d85e9ef Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:48 +0100 Subject: [PATCH 15/57] target/arm: Implement MVE VCLS Implement the MVE VCLS insn. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-5-peter.maydell@linaro.org --- target/arm/helper-mve.h | 4 ++++ target/arm/mve.decode | 1 + target/arm/mve_helper.c | 7 +++++++ target/arm/translate-mve.c | 1 + 4 files changed, 13 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index c5c1315b16..bdd6675ea1 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -33,6 +33,10 @@ DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr) + DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 24999bf703..adceef9159 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -68,4 +68,5 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ # Vector miscellaneous +VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index f2fae523e2..ba01ea3bcd 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -257,6 +257,13 @@ static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) mve_advance_vpt(env); \ } +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_1OP(vclsb, 1, int8_t, DO_CLS_B) +DO_1OP(vclsh, 2, int16_t, DO_CLS_H) +DO_1OP(vclsw, 4, int32_t, clrsb32) + #define DO_CLZ_B(N) (clz32(N) - 24) #define DO_CLZ_H(N) (clz32(N) - 16) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 9eb6a68c97..4e5d032242 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -198,3 +198,4 @@ static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) } DO_1OP(VCLZ, vclz) +DO_1OP(VCLS, vcls) From 249b5309c44831555b7fb6dab68d7a6f9f573882 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:49 +0100 Subject: [PATCH 16/57] target/arm: Implement MVE VREV16, VREV32, VREV64 Implement the MVE instructions VREV16, VREV32 and VREV64. 
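
The per-container element swaps are ordinary shift-and-mask
operations; a standalone C sketch (a hypothetical demo, not the QEMU
helpers, which use the existing bswap32()/hswap32() utilities) of
what VREV32 computes within each 32-bit container:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rev32_b(uint32_t x) /* VREV32.8: reverse 4 bytes */
    {
        return (x >> 24) | ((x >> 8) & 0xff00) |
               ((x << 8) & 0xff0000) | (x << 24);
    }

    static uint32_t rev32_h(uint32_t x) /* VREV32.16: swap 2 halfwords */
    {
        return (x >> 16) | (x << 16);
    }

    int main(void)
    {
        printf("%08x\n", rev32_b(0x11223344)); /* prints 44332211 */
        printf("%08x\n", rev32_h(0x11223344)); /* prints 33441122 */
        return 0;
    }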
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-6-peter.maydell@linaro.org --- target/arm/helper-mve.h | 7 +++++++ target/arm/mve.decode | 4 ++++ target/arm/mve_helper.c | 7 +++++++ target/arm/translate-mve.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index bdd6675ea1..4c89387587 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -40,3 +40,10 @@ DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vrev16b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev32b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev32h, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64h, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64w, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index adceef9159..16ee511a5c 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -70,3 +70,7 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op + +VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op +VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op +VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index ba01ea3bcd..8b565b50a9 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -270,3 +270,10 @@ DO_1OP(vclsw, 4, int32_t, clrsb32) DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) DO_1OP(vclzw, 4, uint32_t, clz32) + +DO_1OP(vrev16b, 2, uint16_t, bswap16) +DO_1OP(vrev32b, 4, uint32_t, bswap32) +DO_1OP(vrev32h, 4, uint32_t, hswap32) +DO_1OP(vrev64b, 8, uint64_t, bswap64) +DO_1OP(vrev64h, 8, uint64_t, hswap64) +DO_1OP(vrev64w, 8, uint64_t, wswap64) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 4e5d032242..32a8324c5e 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -199,3 +199,36 @@ static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) DO_1OP(VCLZ, vclz) DO_1OP(VCLS, vcls) + +static bool trans_VREV16(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev16b, + NULL, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV32(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev32b, + gen_helper_mve_vrev32h, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV64(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev64b, + gen_helper_mve_vrev64h, + gen_helper_mve_vrev64w, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} From 8abd3c80b18757c54e9b270244ca351e407b6405 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:50 +0100 Subject: [PATCH 17/57] target/arm: Implement MVE VMVN (register) Implement the MVE VMVN(register) operation. Note that for predication this operation is byte-by-byte. 
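
Because predication is byte-granular, the helper can work on 64 bits
at a time and merge the result under a byte-expanded mask; a
standalone C sketch of that merge step (a hypothetical demo
mirroring the mergemask()/DO_NOT pattern, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t expand_byte_mask(uint8_t pred) /* 1 bit -> 8 bits */
    {
        uint64_t m = 0;
        for (int i = 0; i < 8; i++) {
            if (pred & (1u << i)) {
                m |= 0xffull << (i * 8);
            }
        }
        return m;
    }

    int main(void)
    {
        uint64_t d = 0x0123456789abcdefull;
        uint64_t r = ~d;                         /* the VMVN result */
        uint64_t bmask = expand_byte_mask(0x0f); /* low 4 bytes active */

        d = (d & ~bmask) | (r & bmask);
        printf("%016llx\n", (unsigned long long)d); /* 0123456776543210 */
        return 0;
    }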
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-7-peter.maydell@linaro.org --- target/arm/helper-mve.h | 2 ++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 4 ++++ target/arm/translate-mve.c | 5 +++++ 4 files changed, 14 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 4c89387587..f1dc52f7a5 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -47,3 +47,5 @@ DEF_HELPER_FLAGS_3(mve_vrev32h, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vrev64b, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vrev64h, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vrev64w, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vmvn, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 16ee511a5c..ff8afb682f 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -30,6 +30,7 @@ @vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr @1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm +@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 # Vector loads and stores @@ -74,3 +75,5 @@ VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op + +VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 8b565b50a9..fa0a32d284 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -277,3 +277,7 @@ DO_1OP(vrev32h, 4, uint32_t, hswap32) DO_1OP(vrev64b, 8, uint64_t, bswap64) DO_1OP(vrev64h, 8, uint64_t, hswap64) DO_1OP(vrev64w, 8, uint64_t, wswap64) + +#define DO_NOT(N) (~(N)) + +DO_1OP(vmvn, 8, uint64_t, DO_NOT) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 32a8324c5e..bd908abcff 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -232,3 +232,8 @@ static bool trans_VREV64(DisasContext *s, arg_1op *a) }; return do_1op(s, a, fns[a->size]); } + +static bool trans_VMVN(DisasContext *s, arg_1op *a) +{ + return do_1op(s, a, gen_helper_mve_vmvn); +} From 59c917733809c6ac7d08a10ec3cf23ae50130248 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:51 +0100 Subject: [PATCH 18/57] target/arm: Implement MVE VABS Implement the MVE VABS functions (both integer and floating point). 
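
For the floating-point form, absolute value is just clearing the
sign bit, which is why the helpers can process 64 bits (two
single-precision or four half-precision elements) per operation; a
standalone C sketch (a hypothetical demo, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        float in[2] = { -1.5f, 2.25f };
        uint64_t bits;

        memcpy(&bits, in, sizeof(bits));
        bits &= 0x7fffffff7fffffffull; /* clear bit 31 of each element */
        memcpy(in, &bits, sizeof(bits));
        printf("%f %f\n", in[0], in[1]); /* 1.500000 2.250000 */
        return 0;
    }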
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-8-peter.maydell@linaro.org --- target/arm/helper-mve.h | 6 ++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 13 +++++++++++++ target/arm/translate-mve.c | 15 +++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index f1dc52f7a5..76508d5dd7 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -49,3 +49,9 @@ DEF_HELPER_FLAGS_3(mve_vrev64h, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vrev64w, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vmvn, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vabsb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vabsw, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfabss, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index ff8afb682f..66963dc184 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -77,3 +77,6 @@ VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz + +VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op +VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index fa0a32d284..2cf28f054b 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -24,6 +24,7 @@ #include "exec/helper-proto.h" #include "exec/cpu_ldst.h" #include "exec/exec-all.h" +#include "tcg/tcg.h" static uint16_t mve_element_mask(CPUARMState *env) { @@ -281,3 +282,15 @@ DO_1OP(vrev64w, 8, uint64_t, wswap64) #define DO_NOT(N) (~(N)) DO_1OP(vmvn, 8, uint64_t, DO_NOT) + +#define DO_ABS(N) ((N) < 0 ? -(N) : (N)) +#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) +#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) + +DO_1OP(vabsb, 1, int8_t, DO_ABS) +DO_1OP(vabsh, 2, int16_t, DO_ABS) +DO_1OP(vabsw, 4, int32_t, DO_ABS) + +/* We can do these 64 bits at a time */ +DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) +DO_1OP(vfabss, 8, uint64_t, DO_FABSS) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index bd908abcff..90996813a8 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -199,6 +199,7 @@ static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) DO_1OP(VCLZ, vclz) DO_1OP(VCLS, vcls) +DO_1OP(VABS, vabs) static bool trans_VREV16(DisasContext *s, arg_1op *a) { @@ -237,3 +238,17 @@ static bool trans_VMVN(DisasContext *s, arg_1op *a) { return do_1op(s, a, gen_helper_mve_vmvn); } + +static bool trans_VABS_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfabsh, + gen_helper_mve_vfabss, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} From 399a8c766c0526b51cd180e1b1c776d6dc95bad8 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:52 +0100 Subject: [PATCH 19/57] target/arm: Implement MVE VNEG Implement the MVE VNEG insn (both integer and floating point forms). 
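
Similarly, floating-point negation is a sign-bit XOR, while integer
negation wraps in two's complement, so negating the most negative
value yields itself; a standalone C sketch (a hypothetical demo, not
QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t b = 0x80;  /* -128 in two's complement */
        printf("0x%02x\n", (unsigned)(uint8_t)-b); /* 0x80: wraps */

        uint32_t f = 0x3fc00000;             /* 1.5f as raw bits */
        printf("0x%08x\n", f ^ 0x80000000u); /* 0xbfc00000, i.e. -1.5f */
        return 0;
    }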
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-9-peter.maydell@linaro.org --- target/arm/helper-mve.h | 6 ++++++ target/arm/mve.decode | 2 ++ target/arm/mve_helper.c | 12 ++++++++++++ target/arm/translate-mve.c | 15 +++++++++++++++ 4 files changed, 35 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 76508d5dd7..733a54d2e3 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -55,3 +55,9 @@ DEF_HELPER_FLAGS_3(mve_vabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vabsw, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vfabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vfabss, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vnegb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vnegw, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfnegs, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 66963dc184..82cc0abcb8 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -80,3 +80,5 @@ VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op +VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op +VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 2cf28f054b..7b662f9e03 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -294,3 +294,15 @@ DO_1OP(vabsw, 4, int32_t, DO_ABS) /* We can do these 64 bits at a time */ DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) DO_1OP(vfabss, 8, uint64_t, DO_FABSS) + +#define DO_NEG(N) (-(N)) +#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) +#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) + +DO_1OP(vnegb, 1, int8_t, DO_NEG) +DO_1OP(vnegh, 2, int16_t, DO_NEG) +DO_1OP(vnegw, 4, int32_t, DO_NEG) + +/* We can do these 64 bits at a time */ +DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) +DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 90996813a8..ad2e4af284 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -200,6 +200,7 @@ static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) DO_1OP(VCLZ, vclz) DO_1OP(VCLS, vcls) DO_1OP(VABS, vabs) +DO_1OP(VNEG, vneg) static bool trans_VREV16(DisasContext *s, arg_1op *a) { @@ -252,3 +253,17 @@ static bool trans_VABS_fp(DisasContext *s, arg_1op *a) } return do_1op(s, a, fns[a->size]); } + +static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfnegh, + gen_helper_mve_vfnegs, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} From 614dd4f3ba2a025eae5235c3466ef6da191879f6 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:53 +0100 Subject: [PATCH 20/57] tcg: Make gen_dup_i32/i64() public as tcg_gen_dup_i32/i64 The Arm MVE VDUP implementation would like to be able to emit code to duplicate a byte or halfword value into an i32. We have code to do this already in tcg-op-gvec.c, so all we need to do is make the functions global. 
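
The duplication itself is simple bit arithmetic; a standalone C
sketch (a hypothetical demo, not the TCG implementation) of what the
emitted code computes for byte and halfword sources:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t b = 0xab, h = 0x1234;
        printf("%08x\n", (b & 0xff) * 0x01010101u);   /* abababab */
        printf("%08x\n", (h & 0xffff) * 0x00010001u); /* 12341234 */
        return 0;
    }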
For consistency with other functions made available to the frontends: * we rename to tcg_gen_dup_* * we expose both the _i32 and _i64 forms * we provide the #define for a _tl form Suggested-by: Richard Henderson Reviewed-by: Richard Henderson Signed-off-by: Peter Maydell Message-id: 20210617121628.20116-10-peter.maydell@linaro.org --- include/tcg/tcg-op.h | 8 ++++++++ include/tcg/tcg.h | 1 - tcg/tcg-op-gvec.c | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index ef8a008ea7..1a2ae93758 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -338,6 +338,9 @@ void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_abs_i32(TCGv_i32, TCGv_i32); +/* Replicate a value of size @vece from @in to all the lanes in @out */ +void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in); + static inline void tcg_gen_discard_i32(TCGv_i32 arg) { tcg_gen_op1_i32(INDEX_op_discard, arg); @@ -534,6 +537,9 @@ void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_abs_i64(TCGv_i64, TCGv_i64); +/* Replicate a value of size @vece from @in to all the lanes in @out */ +void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in); + #if TCG_TARGET_REG_BITS == 64 static inline void tcg_gen_discard_i64(TCGv_i64 arg) { @@ -1127,6 +1133,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); #define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i64 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i64 #define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec +#define tcg_gen_dup_tl tcg_gen_dup_i64 #else #define tcg_gen_movi_tl tcg_gen_movi_i32 #define tcg_gen_mov_tl tcg_gen_mov_i32 @@ -1241,6 +1248,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); #define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i32 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i32 #define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec +#define tcg_gen_dup_tl tcg_gen_dup_i32 #endif #if UINTPTR_MAX == UINT32_MAX diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 064dab383b..483e1e1f24 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1331,7 +1331,6 @@ uint64_t dup_const(unsigned vece, uint64_t c); : (qemu_build_not_reached_always(), 0)) \ : dup_const(VECE, C)) - /* * Memory helpers that will be used by TCG generated code. */ diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index 498a959839..515db120cc 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -386,7 +386,7 @@ uint64_t (dup_const)(unsigned vece, uint64_t c) } /* Duplicate IN into OUT as per VECE. 
*/ -static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) +void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) { switch (vece) { case MO_8: @@ -404,7 +404,7 @@ static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) } } -static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) +void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) { switch (vece) { case MO_8: @@ -578,15 +578,15 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, && (vece != MO_32 || !check_size_impl(oprsz, 4))) { t_64 = tcg_temp_new_i64(); tcg_gen_extu_i32_i64(t_64, in_32); - gen_dup_i64(vece, t_64, t_64); + tcg_gen_dup_i64(vece, t_64, t_64); } else { t_32 = tcg_temp_new_i32(); - gen_dup_i32(vece, t_32, in_32); + tcg_gen_dup_i32(vece, t_32, in_32); } } else if (in_64) { /* We are given a 64-bit variable input. */ t_64 = tcg_temp_new_i64(); - gen_dup_i64(vece, t_64, in_64); + tcg_gen_dup_i64(vece, t_64, in_64); } else { /* We are given a constant input. */ /* For 64-bit hosts, use 64-bit constants for "simple" constants @@ -1311,14 +1311,14 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, } else if (g->fni8 && check_size_impl(oprsz, 8)) { TCGv_i64 t64 = tcg_temp_new_i64(); - gen_dup_i64(g->vece, t64, c); + tcg_gen_dup_i64(g->vece, t64, c); expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); tcg_temp_free_i64(t64); } else if (g->fni4 && check_size_impl(oprsz, 4)) { TCGv_i32 t32 = tcg_temp_new_i32(); tcg_gen_extrl_i64_i32(t32, c); - gen_dup_i32(g->vece, t32, t32); + tcg_gen_dup_i32(g->vece, t32, t32); expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); tcg_temp_free_i32(t32); } else { @@ -2538,7 +2538,7 @@ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); tcg_temp_free_i64(tmp); } @@ -2562,7 +2562,7 @@ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); tcg_temp_free_i64(tmp); } @@ -2586,7 +2586,7 @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); tcg_temp_free_i64(tmp); } From ab59362fca0c23fbd21daceb78d6b2966fbf9793 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:54 +0100 Subject: [PATCH 21/57] target/arm: Implement MVE VDUP Implement the MVE VDUP insn, which duplicates a value from a general-purpose register into every lane of a vector register (subject to predication). 
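In rough C terms (an illustrative sketch only, not the patch's code: real MVE predication is per byte and merges partial lanes, which this model simplifies to all-or-nothing per lane, and the names are invented), VDUP broadcasts the register value into every predicated lane:

    #include <stdint.h>
    #include <stdio.h>

    /* Write val to each 32-bit lane whose predicate mask bits are set. */
    static void vdup_w(uint32_t q[4], uint32_t val, uint16_t mask)
    {
        for (int e = 0; e < 4; e++, mask >>= 4) {
            if (mask & 0xf) {            /* simplified: whole lane on or off */
                q[e] = val;
            }
        }
    }

    int main(void)
    {
        uint32_t q[4] = { 0, 0, 0, 0 };
        vdup_w(q, 0xdeadbeef, 0x0f0f);   /* predicate lanes 0 and 2 only */
        for (int e = 0; e < 4; e++) {
            printf("q[%d] = 0x%08x\n", e, q[e]);
        }
        return 0;
    }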
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-11-peter.maydell@linaro.org --- target/arm/helper-mve.h | 2 ++ target/arm/mve.decode | 10 ++++++++++ target/arm/mve_helper.c | 16 ++++++++++++++++ target/arm/translate-mve.c | 27 +++++++++++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 733a54d2e3..64c3f9e049 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -33,6 +33,8 @@ DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32) DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vdup, TCG_CALL_NO_WG, void, env, ptr, i32) + DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 82cc0abcb8..09849917f5 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -21,6 +21,7 @@ %qd 22:1 13:3 %qm 5:1 1:3 +%qn 7:1 17:3 &vldr_vstr rn qd imm p a w size l u &1op qd qm size @@ -82,3 +83,12 @@ VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op + +&vdup qd rt size +# Qd is in the fields usually named Qn +@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup + +# B and E bits encode size, which we decode here to the usual size values +VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 7b662f9e03..e17ffdccac 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -246,6 +246,22 @@ static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) uint64_t *: mergemask_uq, \ int64_t *: mergemask_sq)(D, R, M) +void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) +{ + /* + * The generated code already replicated an 8 or 16 bit constant + * into the 32-bit value, so we only need to write the 32-bit + * value to all elements of the Qreg, allowing for predication. 
+ */ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + mergemask(&d[H4(e)], val, mask); + } + mve_advance_vpt(env); +} + #define DO_1OP(OP, ESIZE, TYPE, FN) \ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ { \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index ad2e4af284..3714be7f8d 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -162,6 +162,33 @@ DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h) DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w) DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w) +static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ + TCGv_ptr qd; + TCGv_i32 rt; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->rt == 13 || a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rt = load_reg(s, a->rt); + tcg_gen_dup_i32(a->size, rt, rt); + gen_helper_mve_vdup(cpu_env, qd, rt); + tcg_temp_free_ptr(qd); + tcg_temp_free_i32(rt); + mve_update_eci(s); + return true; +} + static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) { TCGv_ptr qd, qm; From 68245e442c9e5175d5e9d3a797dcab7eee800253 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:55 +0100 Subject: [PATCH 22/57] target/arm: Implement MVE VAND, VBIC, VORR, VORN, VEOR Implement the MVE vector logical operations operating on two registers. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-12-peter.maydell@linaro.org --- target/arm/helper-mve.h | 6 ++++++ target/arm/mve.decode | 9 +++++++++ target/arm/mve_helper.c | 26 ++++++++++++++++++++++++++ target/arm/translate-mve.c | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 64c3f9e049..01b6123f25 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -63,3 +63,9 @@ DEF_HELPER_FLAGS_3(mve_vnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vnegw, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vfnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) DEF_HELPER_FLAGS_3(mve_vfnegs, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vand, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vbic, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vorr, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vorn, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_veor, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 09849917f5..332e0b8d1d 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -25,6 +25,7 @@ &vldr_vstr rn qd imm p a w size l u &1op qd qm size +&2op qd qm qn size @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 # Note that both Rn and Qd are 3 bits only (no D bit) @@ -32,6 +33,7 @@ @1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm @1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 +@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 # Vector loads and stores @@ -68,6 +70,13 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 
111110 ....... @vldr_vstr \ size=2 p=1 +# Vector 2-op +VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index e17ffdccac..da62b0e012 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -322,3 +322,29 @@ DO_1OP(vnegw, 4, int32_t, DO_NEG) /* We can do these 64 bits at a time */ DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) + +#define DO_2OP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_AND(N, M) ((N) & (M)) +#define DO_BIC(N, M) ((N) & ~(M)) +#define DO_ORR(N, M) ((N) | (M)) +#define DO_ORN(N, M) ((N) | ~(M)) +#define DO_EOR(N, M) ((N) ^ (M)) + +DO_2OP(vand, 8, uint64_t, DO_AND) +DO_2OP(vbic, 8, uint64_t, DO_BIC) +DO_2OP(vorr, 8, uint64_t, DO_ORR) +DO_2OP(vorn, 8, uint64_t, DO_ORN) +DO_2OP(veor, 8, uint64_t, DO_EOR) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 3714be7f8d..2546567774 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -30,6 +30,7 @@ typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ static inline long mve_qreg_offset(unsigned reg) @@ -294,3 +295,39 @@ static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) } return do_1op(s, a, fns[a->size]); } + +static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn) +{ + TCGv_ptr qd, qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qn, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_LOGIC(INSN, HELPER) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + return do_2op(s, a, HELPER); \ + } + +DO_LOGIC(VAND, gen_helper_mve_vand) +DO_LOGIC(VBIC, gen_helper_mve_vbic) +DO_LOGIC(VORR, gen_helper_mve_vorr) +DO_LOGIC(VORN, gen_helper_mve_vorn) +DO_LOGIC(VEOR, gen_helper_mve_veor) From 9333fe4dd39709ce9898750d517568e5c2fb2e32 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:56 +0100 Subject: [PATCH 23/57] target/arm: Implement MVE VADD, VSUB, VMUL Implement the MVE VADD, VSUB and VMUL insns. 
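The lane arithmetic itself is plain modular integer arithmetic, so signedness does not affect the result; as a minimal standalone sketch (not the patch's code, with predication and beat-wise execution omitted and invented names), each helper is just an element-wise loop:

    #include <stdint.h>
    #include <stdio.h>

    /* Element-wise add of two 16-byte vectors; each lane wraps mod 2^8. */
    static void vadd_b(uint8_t d[16], const uint8_t n[16], const uint8_t m[16])
    {
        for (int e = 0; e < 16; e++) {
            d[e] = n[e] + m[e];          /* unsigned overflow wraps naturally */
        }
    }

    int main(void)
    {
        uint8_t n[16], m[16], d[16];
        for (int e = 0; e < 16; e++) {
            n[e] = 0xf0 + e;
            m[e] = 0x20;
        }
        vadd_b(d, n, m);
        for (int e = 0; e < 16; e++) {
            printf("%02x ", d[e]);       /* 10 11 12 ...: the adds wrapped */
        }
        printf("\n");
        return 0;
    }

This is why the patch only needs the unsigned DO_2OP_U expansion to cover all three insns.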
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-13-peter.maydell@linaro.org --- target/arm/helper-mve.h | 12 ++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 14 ++++++++++++++ target/arm/translate-mve.c | 16 ++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 01b6123f25..707b9cbd54 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -69,3 +69,15 @@ DEF_HELPER_FLAGS_4(mve_vbic, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vorr, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vorn, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_veor, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vaddb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vaddh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vaddw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vsubb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsubh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsubw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 332e0b8d1d..f7d1d303f1 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -33,6 +33,7 @@ @1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm @1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 +@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 # Vector loads and stores @@ -77,6 +78,10 @@ VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 
0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index da62b0e012..23da96402e 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -337,6 +337,12 @@ DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) mve_advance_vpt(env); \ } +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_U(OP, FN) \ + DO_2OP(OP##b, 1, uint8_t, FN) \ + DO_2OP(OP##h, 2, uint16_t, FN) \ + DO_2OP(OP##w, 4, uint32_t, FN) + #define DO_AND(N, M) ((N) & (M)) #define DO_BIC(N, M) ((N) & ~(M)) #define DO_ORR(N, M) ((N) | (M)) @@ -348,3 +354,11 @@ DO_2OP(vbic, 8, uint64_t, DO_BIC) DO_2OP(vorr, 8, uint64_t, DO_ORR) DO_2OP(vorn, 8, uint64_t, DO_ORN) DO_2OP(veor, 8, uint64_t, DO_EOR) + +#define DO_ADD(N, M) ((N) + (M)) +#define DO_SUB(N, M) ((N) - (M)) +#define DO_MUL(N, M) ((N) * (M)) + +DO_2OP_U(vadd, DO_ADD) +DO_2OP_U(vsub, DO_SUB) +DO_2OP_U(vmul, DO_MUL) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 2546567774..5d3dee4699 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -331,3 +331,19 @@ DO_LOGIC(VBIC, gen_helper_mve_vbic) DO_LOGIC(VORR, gen_helper_mve_vorr) DO_LOGIC(VORN, gen_helper_mve_vorn) DO_LOGIC(VEOR, gen_helper_mve_veor) + +#define DO_2OP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op(s, a, fns[a->size]); \ + } + +DO_2OP(VADD, vadd) +DO_2OP(VSUB, vsub) +DO_2OP(VMUL, vmul) From ba62cc56e8a0aa84337c50766d499ba4199394df Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:57 +0100 Subject: [PATCH 24/57] target/arm: Implement MVE VMULH Implement the MVE VMULH insn, which performs a vector multiply and returns the high half of the result. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-14-peter.maydell@linaro.org --- target/arm/helper-mve.h | 7 +++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 26 ++++++++++++++++++++++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 38 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 707b9cbd54..5c80b185cc 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -81,3 +81,10 @@ DEF_HELPER_FLAGS_4(mve_vsubw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index f7d1d303f1..ca4c27209d 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -82,6 +82,9 @@ VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op +VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 
1 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 23da96402e..f1dd688f78 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -362,3 +362,29 @@ DO_2OP(veor, 8, uint64_t, DO_EOR) DO_2OP_U(vadd, DO_ADD) DO_2OP_U(vsub, DO_SUB) DO_2OP_U(vmul, DO_MUL) + +/* + * Because the computation type is at least twice as large as required, + * these work for both signed and unsigned source types. + */ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_w(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) +DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) +DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) +DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) +DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) +DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 5d3dee4699..de7d8b6c75 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -347,3 +347,5 @@ DO_LOGIC(VEOR, gen_helper_mve_veor) DO_2OP(VADD, vadd) DO_2OP(VSUB, vsub) DO_2OP(VMUL, vmul) +DO_2OP(VMULH_S, vmulhs) +DO_2OP(VMULH_U, vmulhu) From fca87b78f3d178518a38063498d477f5e10c5c22 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:58 +0100 Subject: [PATCH 25/57] target/arm: Implement MVE VRMULH Implement the MVE VRMULH insn, which performs a rounding multiply and then returns the high half. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-15-peter.maydell@linaro.org --- target/arm/helper-mve.h | 7 +++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 22 ++++++++++++++++++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 34 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 5c80b185cc..0e496971f0 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -88,3 +88,10 @@ DEF_HELPER_FLAGS_4(mve_vmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index ca4c27209d..4ab6c9dba9 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -85,6 +85,9 @@ VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 
0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index f1dd688f78..6cd47d3458 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -382,9 +382,31 @@ static inline uint32_t do_mulh_w(int64_t n, int64_t m) return (n * m) >> 32; } +static inline uint8_t do_rmulh_b(int32_t n, int32_t m) +{ + return (n * m + (1U << 7)) >> 8; +} + +static inline uint16_t do_rmulh_h(int32_t n, int32_t m) +{ + return (n * m + (1U << 15)) >> 16; +} + +static inline uint32_t do_rmulh_w(int64_t n, int64_t m) +{ + return (n * m + (1U << 31)) >> 32; +} + DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) + +DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) +DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) +DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) +DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) +DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) +DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index de7d8b6c75..bc66058fd3 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -349,3 +349,5 @@ DO_2OP(VSUB, vsub) DO_2OP(VMUL, vmul) DO_2OP(VMULH_S, vmulhs) DO_2OP(VMULH_U, vmulhu) +DO_2OP(VRMULH_S, vrmulhs) +DO_2OP(VRMULH_U, vrmulhu) From cd367ff3919e020b50a10cf4955fc31042e73f24 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:15:59 +0100 Subject: [PATCH 26/57] target/arm: Implement MVE VMAX, VMIN Implement the MVE VMAX and VMIN insns. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-16-peter.maydell@linaro.org --- target/arm/helper-mve.h | 14 ++++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 14 ++++++++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 37 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 0e496971f0..5181d3b941 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -95,3 +95,17 @@ DEF_HELPER_FLAGS_4(mve_vrmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vrmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vrmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vrmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmaxsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vminsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 4ab6c9dba9..42d5504500 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -88,6 +88,11 @@ VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 
1 @2op VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op +VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 6cd47d3458..c040e42bda 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -343,6 +343,12 @@ DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) DO_2OP(OP##h, 2, uint16_t, FN) \ DO_2OP(OP##w, 4, uint32_t, FN) +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_S(OP, FN) \ + DO_2OP(OP##b, 1, int8_t, FN) \ + DO_2OP(OP##h, 2, int16_t, FN) \ + DO_2OP(OP##w, 4, int32_t, FN) + #define DO_AND(N, M) ((N) & (M)) #define DO_BIC(N, M) ((N) & ~(M)) #define DO_ORR(N, M) ((N) | (M)) @@ -410,3 +416,11 @@ DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) + +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) + +DO_2OP_S(vmaxs, DO_MAX) +DO_2OP_U(vmaxu, DO_MAX) +DO_2OP_S(vmins, DO_MIN) +DO_2OP_U(vminu, DO_MIN) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index bc66058fd3..107c393a99 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -351,3 +351,7 @@ DO_2OP(VMULH_S, vmulhs) DO_2OP(VMULH_U, vmulhu) DO_2OP(VRMULH_S, vrmulhs) DO_2OP(VRMULH_U, vrmulhu) +DO_2OP(VMAX_S, vmaxs) +DO_2OP(VMAX_U, vmaxu) +DO_2OP(VMIN_S, vmins) +DO_2OP(VMIN_U, vminu) From bc67aa8d561e6ebf93b724c9abf3a7a1f95839c9 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:00 +0100 Subject: [PATCH 27/57] target/arm: Implement MVE VABD Implement the MVE VABD insn. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-17-peter.maydell@linaro.org --- target/arm/helper-mve.h | 7 +++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 5 +++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 17 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 5181d3b941..5cd4e7d736 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -109,3 +109,10 @@ DEF_HELPER_FLAGS_4(mve_vminsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vminub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vminuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vminuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vabdsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 42d5504500..087d3db2a3 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -93,6 +93,9 @@ VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 
1 ... 0 @2op +VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op +VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index c040e42bda..63eacd7349 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -424,3 +424,8 @@ DO_2OP_S(vmaxs, DO_MAX) DO_2OP_U(vmaxu, DO_MAX) DO_2OP_S(vmins, DO_MIN) DO_2OP_U(vminu, DO_MIN) + +#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) + +DO_2OP_S(vabds, DO_ABD) +DO_2OP_U(vabdu, DO_ABD) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 107c393a99..041fd1ef14 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -355,3 +355,5 @@ DO_2OP(VMAX_S, vmaxs) DO_2OP(VMAX_U, vmaxu) DO_2OP(VMIN_S, vmins) DO_2OP(VMIN_U, vminu) +DO_2OP(VABD_S, vabds) +DO_2OP(VABD_U, vabdu) From abc48e310cc95f616ae65ccb167019eebf7e705b Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:01 +0100 Subject: [PATCH 28/57] target/arm: Implement MVE VHADD, VHSUB Implement MVE VHADD and VHSUB insns, which perform an addition or subtraction and then halve the result. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-18-peter.maydell@linaro.org --- target/arm/helper-mve.h | 14 ++++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 25 +++++++++++++++++++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 48 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 5cd4e7d736..02bef53ed4 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -116,3 +116,17 @@ DEF_HELPER_FLAGS_4(mve_vabdsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vabdub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vabduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vabduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 087d3db2a3..241d1c44c1 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -96,6 +96,11 @@ VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op +VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op +VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 
0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 63eacd7349..835832fdf6 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -429,3 +429,28 @@ DO_2OP_U(vminu, DO_MIN) DO_2OP_S(vabds, DO_ABD) DO_2OP_U(vabdu, DO_ABD) + +static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n + m) >> 1; +} + +static inline int32_t do_vhadd_s(int32_t n, int32_t m) +{ + return ((int64_t)n + m) >> 1; +} + +static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n - m) >> 1; +} + +static inline int32_t do_vhsub_s(int32_t n, int32_t m) +{ + return ((int64_t)n - m) >> 1; +} + +DO_2OP_S(vhadds, do_vhadd_s) +DO_2OP_U(vhaddu, do_vhadd_u) +DO_2OP_S(vhsubs, do_vhsub_s) +DO_2OP_U(vhsubu, do_vhsub_u) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 041fd1ef14..f593d3693b 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -357,3 +357,7 @@ DO_2OP(VMIN_S, vmins) DO_2OP(VMIN_U, vminu) DO_2OP(VABD_S, vabds) DO_2OP(VABD_U, vabdu) +DO_2OP(VHADD_S, vhadds) +DO_2OP(VHADD_U, vhaddu) +DO_2OP(VHSUB_S, vhsubs) +DO_2OP(VHSUB_U, vhsubu) From ac6ad1dca84e39038e149c7b91adf9642e89ca70 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:02 +0100 Subject: [PATCH 29/57] target/arm: Implement MVE VMULL Implement the MVE VMULL insn, which multiplies two single width integer elements to produce a double width result. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-19-peter.maydell@linaro.org --- target/arm/helper-mve.h | 14 ++++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 34 ++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 57 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 02bef53ed4..9bbeb7ec49 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -130,3 +130,17 @@ DEF_HELPER_FLAGS_4(mve_vhsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vhsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vhsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vhsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmullbsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulltsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 241d1c44c1..5a480d61cd 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -101,6 +101,11 @@ VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 
0 @2op VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op +VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op +VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 835832fdf6..a3d09db83b 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -349,6 +349,26 @@ DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) DO_2OP(OP##h, 2, int16_t, FN) \ DO_2OP(OP##w, 4, int32_t, FN) +/* + * "Long" operations where two half-sized inputs (taken from either the + * top or the bottom of the input vector) produce a double-width result. + * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. + */ +#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ + m[H##ESIZE(le * 2 + TOP)]); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + #define DO_AND(N, M) ((N) & (M)) #define DO_BIC(N, M) ((N) & ~(M)) #define DO_ORR(N, M) ((N) | (M)) @@ -369,6 +389,20 @@ DO_2OP_U(vadd, DO_ADD) DO_2OP_U(vsub, DO_SUB) DO_2OP_U(vmul, DO_MUL) +DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) + +DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) + /* * Because the computation type is at least twice as large as required, * these work for both signed and unsigned source types. diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index f593d3693b..1cadc3b04d 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -361,3 +361,7 @@ DO_2OP(VHADD_S, vhadds) DO_2OP(VHADD_U, vhaddu) DO_2OP(VHSUB_S, vhsubs) DO_2OP(VHSUB_U, vhsubu) +DO_2OP(VMULL_BS, vmullbs) +DO_2OP(VMULL_BU, vmullbu) +DO_2OP(VMULL_TS, vmullts) +DO_2OP(VMULL_TU, vmulltu) From 1d2386f70a0cb2ad9c5fab2cf1eedb80bb5b313d Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:03 +0100 Subject: [PATCH 30/57] target/arm: Implement MVE VMLALDAV Implement the MVE VMLALDAV insn, which multiplies pairs of integer elements, accumulating them into a 64-bit result in a pair of general-purpose registers. 
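The dataflow, stripped of predication and beat-wise execution, looks roughly like this (a standalone model, not the patch's code; the encoding's X bit exchanges which neighbouring element of Qn pairs with each element of Qm, and the names here are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* a += n[e] * m[e] over all 16-bit lanes, widened to 64 bits. */
    static int64_t vmlaldav_h(const int16_t n[8], const int16_t m[8],
                              int64_t a, int x)
    {
        for (int e = 0; e < 8; e++) {
            int ne = (e & 1) ? e - x : e + x;   /* X swaps even/odd pairing */
            a += (int64_t)n[ne] * m[e];
        }
        return a;
    }

    int main(void)
    {
        int16_t n[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        int16_t m[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
        printf("%lld\n", (long long)vmlaldav_h(n, m, 0, 0));  /* 36 */
        printf("%lld\n", (long long)vmlaldav_h(n, m, 0, 1));  /* also 36 here */
        return 0;
    }

The 64-bit accumulator is split across the RdaLo/RdaHi general-purpose register pair on the way in and out.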
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-20-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++ target/arm/mve.decode | 15 ++++++ target/arm/mve_helper.c | 34 ++++++++++++++ target/arm/translate-mve.c | 96 ++++++++++++++++++++++++++++++++++++++ target/arm/translate.h | 10 ++++ 5 files changed, 163 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 9bbeb7ec49..0138e28278 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -144,3 +144,11 @@ DEF_HELPER_FLAGS_4(mve_vmulltsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vmlaldavuh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 5a480d61cd..bde54d05bb 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -130,3 +130,18 @@ VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 + +# multiply-add long dual accumulate +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi 20:3 !function=times_2_plus_1 +%rdalo 13:3 !function=times_2 +# size bit is 0 for 16 bit, 1 for 32 bit +%size_16 16:1 !function=plus_1 + +&vmlaldav rdahi rdalo size qn qm x a + +@vmlaldav .... .... . ... ... . ... . .... .... qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav +VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav +VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index a3d09db83b..4d586c4d26 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -488,3 +488,37 @@ DO_2OP_S(vhadds, do_vhadd_s) DO_2OP_U(vhaddu, do_vhadd_u) DO_2OP_S(vhsubs, do_vhsub_s) DO_2OP_U(vhsubu, do_vhsub_u) + + +/* + * Multiply add long dual accumulate ops. 
+ */ +#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) +DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) +DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) +DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) + +DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) +DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 1cadc3b04d..f8ceeac5a4 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -31,6 +31,7 @@ typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ static inline long mve_qreg_offset(unsigned reg) @@ -88,6 +89,22 @@ static void mve_update_eci(DisasContext *s) } } +static bool mve_skip_first_beat(DisasContext *s) +{ + /* Return true if PSR.ECI says we must skip the first beat of this insn */ + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + g_assert_not_reached(); + } +} + static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn) { TCGv_i32 addr; @@ -365,3 +382,82 @@ DO_2OP(VMULL_BS, vmullbs) DO_2OP(VMULL_BU, vmullbu) DO_2OP(VMULL_TS, vmullts) DO_2OP(VMULL_TU, vmulltu) + +static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, + MVEGenDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn | a->qm) || + !fn) { + return false; + } + /* + * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related + * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. + */ + if (a->rdahi == 13 || a->rdahi == 15) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. 
+ */ + if (a->a || mve_skip_first_beat(s)) { + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + rda = tcg_const_i64(0); + } + + fn(rda, cpu_env, qn, qm, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh }, + { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavuh, NULL }, + { gen_helper_mve_vmlaldavuw, NULL }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} diff --git a/target/arm/translate.h b/target/arm/translate.h index 2821b325e3..99c917c571 100644 --- a/target/arm/translate.h +++ b/target/arm/translate.h @@ -136,6 +136,11 @@ static inline int negate(DisasContext *s, int x) return -x; } +static inline int plus_1(DisasContext *s, int x) +{ + return x + 1; +} + static inline int plus_2(DisasContext *s, int x) { return x + 2; @@ -151,6 +156,11 @@ static inline int times_4(DisasContext *s, int x) return x * 4; } +static inline int times_2_plus_1(DisasContext *s, int x) +{ + return x * 2 + 1; +} + static inline int arm_dc_feature(DisasContext *dc, int feature) { return (dc->features & (1ULL << feature)) != 0; From 181cd97143629a304f75acf894ca79b26bf32378 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:04 +0100 Subject: [PATCH 31/57] target/arm: Implement MVE VMLSLDAV Implement the MVE insn VMLSLDAV, which multiplies source elements, alternately adding and subtracting them, and accumulates into a 64-bit result in a pair of general purpose registers. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-21-peter.maydell@linaro.org --- target/arm/helper-mve.h | 5 +++++ target/arm/mve.decode | 2 ++ target/arm/mve_helper.c | 5 +++++ target/arm/translate-mve.c | 11 +++++++++++ 4 files changed, 23 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 0138e28278..7356385d60 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -152,3 +152,8 @@ DEF_HELPER_FLAGS_4(mve_vmlaldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavuh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vmlsldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index bde54d05bb..1be2d6b270 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -145,3 +145,5 @@ VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 
0 0 1 0000 @vdup size=2 qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav + +VMLSLDAV 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 4d586c4d26..b496e1f4d3 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -522,3 +522,8 @@ DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) + +DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) +DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) +DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) +DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index f8ceeac5a4..77b461c218 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -461,3 +461,14 @@ static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) }; return do_long_dual_acc(s, a, fns[a->size][a->x]); } + +static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh }, + { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} From 38548747335a0796ab1d636c8b5bcf5c248ce437 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:05 +0100 Subject: [PATCH 32/57] target/arm: Implement MVE VRMLALDAVH, VRMLSLDAVH Implement the MVE VRMLALDAVH and VRMLSLDAVH insns, which accumulate the results of a rounded multiply of pairs of elements into a 72-bit accumulator, returning the top 64 bits in a pair of general purpose registers. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-22-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 7 +++++++ target/arm/mve_helper.c | 37 +++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 24 ++++++++++++++++++++++++ 4 files changed, 76 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 7356385d60..f9d4b242be 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -157,3 +157,11 @@ DEF_HELPER_FLAGS_4(mve_vmlsldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlsldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlsldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlsldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlaldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vrmlaldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlaldavhuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlsldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vrmlsldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 1be2d6b270..ac68f072bb 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -143,7 +143,14 @@ VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 @vmlaldav .... .... . ... ... . ... . .... .... qm:3 . \ qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav +@vmlaldav_nosz .... .... . ... ... . ... . .... .... qm:3 . 
\ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav VMLSLDAV 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav + +VRMLALDAVH_S 1110 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz +VRMLALDAVH_U 1111 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz + +VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_nosz diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index b496e1f4d3..c1427b0f02 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" +#include "qemu/int128.h" #include "cpu.h" #include "internals.h" #include "vec_internal.h" @@ -527,3 +528,39 @@ DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) + +/* + * Rounding multiply add long dual accumulate high: we must keep + * a 72-bit internal accumulator value and return the top 64 bits. + */ +#define DO_LDAVH(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC, TO128) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + Int128 acc = int128_lshift(TO128(a), 8); \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + acc = ODDACC(acc, TO128(n[H##ESIZE(e - 1 * XCHG)] * \ + m[H##ESIZE(e)])); \ + } else { \ + acc = EVENACC(acc, TO128(n[H##ESIZE(e + 1 * XCHG)] * \ + m[H##ESIZE(e)])); \ + } \ + acc = int128_add(acc, int128_make64(1 << 7)); \ + } \ + } \ + mve_advance_vpt(env); \ + return int128_getlo(int128_rshift(acc, 8)); \ + } + +DO_LDAVH(vrmlaldavhsw, 4, int32_t, false, int128_add, int128_add, int128_makes64) +DO_LDAVH(vrmlaldavhxsw, 4, int32_t, true, int128_add, int128_add, int128_makes64) + +DO_LDAVH(vrmlaldavhuw, 4, uint32_t, false, int128_add, int128_add, int128_make64) + +DO_LDAVH(vrmlsldavhsw, 4, int32_t, false, int128_add, int128_sub, int128_makes64) +DO_LDAVH(vrmlsldavhxsw, 4, int32_t, true, int128_add, int128_sub, int128_makes64) diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 77b461c218..a0c4f10a93 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -472,3 +472,27 @@ static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) }; return do_long_dual_acc(s, a, fns[a->size][a->x]); } + +static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhuw, NULL, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} From e51896b3866ffb74df5aaa3b33c35e7113e5c6b9 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:06 +0100 Subject: [PATCH 33/57] target/arm: Implement MVE VADD (scalar) Implement the scalar form of the MVE VADD insn. 
This takes the scalar operand from a general purpose register. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-23-peter.maydell@linaro.org --- target/arm/helper-mve.h | 4 ++++ target/arm/mve.decode | 7 ++++++ target/arm/mve_helper.c | 22 +++++++++++++++++++ target/arm/translate-mve.c | 45 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index f9d4b242be..16b974a427 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -145,6 +145,10 @@ DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index ac68f072bb..0ee7a72708 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -26,6 +26,7 @@ &vldr_vstr rn qd imm p a w size l u &1op qd qm size &2op qd qm qn size +&2scalar qd qn rm size @vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 # Note that both Rn and Qd are 3 bits only (no D bit) @@ -36,6 +37,8 @@ @2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn + # Vector loads and stores # Widening loads and narrowing stores: @@ -154,3 +157,7 @@ VRMLALDAVH_S 1110 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_no VRMLALDAVH_U 1111 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_nosz + +# Scalar operations + +VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index c1427b0f02..6b8cead463 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -491,6 +491,28 @@ DO_2OP_S(vhsubs, do_vhsub_s) DO_2OP_U(vhsubu, do_vhsub_u) +#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op scalar helpers for all sizes */ +#define DO_2OP_SCALAR_U(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) + +DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) + /* * Multiply add long dual accumulate ops. 
*/ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index a0c4f10a93..388848b4ff 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -31,6 +31,7 @@ typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ @@ -383,6 +384,50 @@ DO_2OP(VMULL_BU, vmullbu) DO_2OP(VMULL_TS, vmullts) DO_2OP(VMULL_TU, vmulltu) +static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, + MVEGenTwoOpScalarFn fn) +{ + TCGv_ptr qd, qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn) || + !fn) { + return false; + } + if (a->rm == 13 || a->rm == 15) { + /* UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + rm = load_reg(s, a->rm); + fn(cpu_env, qd, qn, rm); + tcg_temp_free_i32(rm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_2OP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_SCALAR(VADD_scalar, vadd_scalar) + static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, MVEGenDualAccOpFn *fn) { From 91a358fdfb3b116a6ea72a38d5c217caad1d45b5 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:07 +0100 Subject: [PATCH 34/57] target/arm: Implement MVE VSUB, VMUL (scalar) Implement the scalar forms of the MVE VSUB and VMUL insns. 
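These reuse the two-operand-scalar pattern just introduced for VADD: the
general purpose register value is duplicated into every lane, and the result
is merged into Qd byte by byte under the predicate mask. As a worked
illustration of those semantics, here is a self-contained C sketch; the
function name, the test values and the little-endian byte order are
assumptions of the example, and this is not the QEMU helper itself.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of "lane op scalar" over four 32-bit lanes with byte-granular
 * predication: only bytes whose mask bit is set are written back to d.
 */
static void sketch_vsub_scalar_w(uint32_t *d, const uint32_t *n,
                                 uint32_t scalar, uint16_t mask)
{
    for (unsigned e = 0; e < 4; e++, mask >>= 4) {
        uint32_t r = n[e] - scalar;            /* the lane operation */
        for (unsigned b = 0; b < 4; b++) {     /* byte-wise merge */
            if (mask & (1u << b)) {
                ((uint8_t *)&d[e])[b] = ((uint8_t *)&r)[b];
            }
        }
    }
}

int main(void)
{
    uint32_t n[4] = { 10, 20, 30, 40 }, d[4] = { 0, 0, 0, 0 };
    sketch_vsub_scalar_w(d, n, 5, 0x00ff);  /* predicate covers lanes 0-1 */
    printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);  /* prints: 5 15 0 0 */
    return 0;
}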
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-24-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 2 ++ target/arm/mve_helper.c | 2 ++ target/arm/translate-mve.c | 2 ++ 4 files changed, 14 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 16b974a427..912505d015 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -149,6 +149,14 @@ DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vsub_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vsub_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vsub_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vmul_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vmul_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vmul_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 0ee7a72708..af5fba78ce 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -161,3 +161,5 @@ VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_no # Scalar operations VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar +VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar +VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 6b8cead463..33755bc313 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -512,6 +512,8 @@ DO_2OP_U(vhsubu, do_vhsub_u) DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) +DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) +DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) /* * Multiply add long dual accumulate ops. diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 388848b4ff..3c059ad91c 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -427,6 +427,8 @@ static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, } DO_2OP_SCALAR(VADD_scalar, vadd_scalar) +DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) +DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, MVEGenDualAccOpFn *fn) From 644f717c35ec29d53f6fc34523e096fbad6eeaf9 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:08 +0100 Subject: [PATCH 35/57] target/arm: Implement MVE VHADD, VHSUB (scalar) Implement the scalar variants of the MVE VHADD and VHSUB insns. 
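The halving ops compute the full-precision sum or difference and then shift
it right by one bit, so the intermediate can never wrap. A minimal standalone
sketch of that arithmetic (illustrative only; it assumes an arithmetic right
shift of negative values, which is what the QEMU helpers rely on):

#include <stdint.h>
#include <assert.h>

/*
 * Signed 8-bit halving add/sub: widen, operate, keep the top of the
 * 9-bit result. The shift rounds toward minus infinity.
 */
static int8_t hadd_s8(int8_t n, int8_t m)
{
    return (int8_t)(((int16_t)n + m) >> 1);
}

static int8_t hsub_s8(int8_t n, int8_t m)
{
    return (int8_t)(((int16_t)n - m) >> 1);
}

int main(void)
{
    assert(hadd_s8(127, 127) == 127);    /* 254 >> 1: no wraparound */
    assert(hadd_s8(3, 4) == 3);          /* 7 >> 1 rounds down */
    assert(hsub_s8(-128, 127) == -128);  /* -255 >> 1 */
    return 0;
}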
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-25-peter.maydell@linaro.org --- target/arm/helper-mve.h | 16 ++++++++++++++++ target/arm/mve.decode | 4 ++++ target/arm/mve_helper.c | 8 ++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 32 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 912505d015..52086d769f 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -157,6 +157,22 @@ DEF_HELPER_FLAGS_4(mve_vmul_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vmul_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vmul_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index af5fba78ce..5c332b04a7 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -163,3 +163,7 @@ VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_no VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar +VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 33755bc313..d76168a7a4 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -510,10 +510,18 @@ DO_2OP_U(vhsubu, do_vhsub_u) DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) +#define DO_2OP_SCALAR_S(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, int32_t, FN) DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) +DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) +DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) +DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) +DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) /* * Multiply add long dual accumulate ops. 
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 3c059ad91c..4b379bfe6e 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -429,6 +429,10 @@ static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, DO_2OP_SCALAR(VADD_scalar, vadd_scalar) DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) +DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) +DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) +DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) +DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, MVEGenDualAccOpFn *fn) From b050543b68308427792cc024fb2905b041ebc253 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:09 +0100 Subject: [PATCH 36/57] target/arm: Implement MVE VBRSR Implement the MVE VBRSR insn, which reverses a specified number of bits in each element, setting the rest to zero. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-26-peter.maydell@linaro.org --- target/arm/helper-mve.h | 4 ++++ target/arm/mve.decode | 1 + target/arm/mve_helper.c | 43 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 1 + 4 files changed, 49 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 52086d769f..1b807e1cf5 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -173,6 +173,10 @@ DEF_HELPER_FLAGS_4(mve_vhsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vhsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vhsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 5c332b04a7..a3dbdb72a5 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -167,3 +167,4 @@ VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar +VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... 
@2scalar diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index d76168a7a4..441bc8467b 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -523,6 +523,49 @@ DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) +static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit8(n); + if (m < 8) { + n >>= 8 - m; + } + return n; +} + +static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit16(n); + if (m < 16) { + n >>= 16 - m; + } + return n; +} + +static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit32(n); + if (m < 32) { + n >>= 32 - m; + } + return n; +} + +DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) +DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) +DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) + /* * Multiply add long dual accumulate ops. */ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 4b379bfe6e..6320064a08 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -433,6 +433,7 @@ DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) +DO_2OP_SCALAR(VBRSR, vbrsr) static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, MVEGenDualAccOpFn *fn) From 387debdb93d2635fb6d62bff38887d17ef4d8117 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:10 +0100 Subject: [PATCH 37/57] target/arm: Implement MVE VPST Implement the MVE VPST insn, which sets the predicate mask fields in the VPR to the immediate value encoded in the insn. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-27-peter.maydell@linaro.org --- target/arm/mve.decode | 4 +++ target/arm/translate-mve.c | 59 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index a3dbdb72a5..e189e2de64 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -168,3 +168,7 @@ VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar + +# Predicate operations +%mask_22_13 22:1 13:3 +VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 6320064a08..7c4c06e434 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -90,6 +90,19 @@ static void mve_update_eci(DisasContext *s) } } +static void mve_update_and_store_eci(DisasContext *s) +{ + /* + * For insns which don't call a helper function that will call + * mve_advance_vpt(), this version updates s->eci and also stores + * it out to the CPUState field. 
+ */ + if (s->eci) { + mve_update_eci(s); + store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits); + } +} + static bool mve_skip_first_beat(DisasContext *s) { /* Return true if PSR.ECI says we must skip the first beat of this insn */ @@ -548,3 +561,49 @@ static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) }; return do_long_dual_acc(s, a, fns[a->x]); } + +static bool trans_VPST(DisasContext *s, arg_VPST *a) +{ + TCGv_i32 vpr; + + /* mask == 0 is a "related encoding" */ + if (!dc_isar_feature(aa32_mve, s) || !a->mask) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + /* + * Set the VPR mask fields. We take advantage of MASK01 and MASK23 + * being adjacent fields in the register. + * + * This insn is not predicated, but it is subject to beat-wise + * execution, and the mask is updated on the odd-numbered beats. + * So if PSR.ECI says we should skip beat 1, we mustn't update the + * 01 mask field. + */ + vpr = load_cpu_field(v7m.vpr); + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + /* Update both 01 and 23 fields */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(a->mask | (a->mask << 4)), + R_V7M_VPR_MASK01_SHIFT, + R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH); + break; + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + /* Update only the 23 mask field */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(a->mask), + R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH); + break; + default: + g_assert_not_reached(); + } + store_cpu_field(vpr, v7m.vpr); + mve_update_and_store_eci(s); + return true; +} From 39f2ec8592dd3c823034dc4decc64c7e4cc42bfd Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:11 +0100 Subject: [PATCH 38/57] target/arm: Implement MVE VQADD and VQSUB Implement the MVE VQADD and VQSUB insns, which perform saturating addition or subtraction of a scalar to or from each element. Note that individual bytes of each result element are used or discarded according to the predicate mask, but FPSCR.QC is only set if the predicate mask for the lowest byte of the element is set. 
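To make the QC rule concrete, here is a standalone sketch of one pass over
eight 16-bit lanes (the names and values are invented for the example): a
lane that saturates only sets QC when bit 0 of its two-bit slice of the
predicate mask is set.

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

static int16_t sat_add_s16(int16_t n, int16_t m, bool *sat)
{
    int32_t r = (int32_t)n + m;
    if (r > INT16_MAX) { *sat = true; return INT16_MAX; }
    if (r < INT16_MIN) { *sat = true; return INT16_MIN; }
    return (int16_t)r;
}

static bool qc_after(const int16_t *n, int16_t m, uint16_t mask)
{
    bool qc = false;
    for (unsigned e = 0; e < 8; e++, mask >>= 2) {
        bool sat = false;
        (void)sat_add_s16(n[e], m, &sat);
        qc |= sat && (mask & 1); /* only the low-byte predicate bit counts */
    }
    return qc;
}

int main(void)
{
    int16_t n[8] = { 30000 };            /* element 0 will saturate */
    assert(!qc_after(n, 10000, 0x0002)); /* high byte only: QC untouched */
    assert(qc_after(n, 10000, 0x0003));  /* low byte live: QC is set */
    return 0;
}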
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-28-peter.maydell@linaro.org --- target/arm/helper-mve.h | 16 ++++++++++ target/arm/mve.decode | 5 +++ target/arm/mve_helper.c | 62 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 4 +++ 4 files changed, 87 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 1b807e1cf5..092efdab47 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -173,6 +173,22 @@ DEF_HELPER_FLAGS_4(mve_vhsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vhsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vhsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index e189e2de64..c85227c675 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -167,6 +167,11 @@ VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar + +VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar +VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar +VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar +VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... 
@2scalar # Predicate operations diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 441bc8467b..91139d2c0d 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -490,6 +490,33 @@ DO_2OP_U(vhaddu, do_vhadd_u) DO_2OP_S(vhsubs, do_vhsub_s) DO_2OP_U(vhsubu, do_vhsub_u) +static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) +{ + if (val > max) { + *s = true; + return max; + } else if (val < min) { + *s = true; + return min; + } + return val; +} + +#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) +#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) +#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) + +#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) +#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) +#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) + +#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) +#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) +#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) + +#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) +#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) +#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ @@ -505,6 +532,27 @@ DO_2OP_U(vhsubu, do_vhsub_u) mve_advance_vpt(env); \ } +#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + /* provide unsigned 2-op scalar helpers for all sizes */ #define DO_2OP_SCALAR_U(OP, FN) \ DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ @@ -523,6 +571,20 @@ DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) +DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) + static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) { m &= 0xff; diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 7c4c06e434..27c69d9c7d 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -446,6 +446,10 @@ DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) 
DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) +DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) +DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) +DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) +DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) DO_2OP_SCALAR(VBRSR, vbrsr) static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, From 66c0576754b100606e041fef54e5b897417426c7 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:12 +0100 Subject: [PATCH 39/57] target/arm: Implement MVE VQDMULH and VQRDMULH (scalar) Implement the MVE VQDMULH and VQRDMULH scalar insns, which multiply elements by the scalar, double, possibly round, take the high half and saturate. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-29-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 25 +++++++++++++++++++++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 38 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 092efdab47..a0a01d0cc3 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -189,6 +189,14 @@ DEF_HELPER_FLAGS_4(mve_vqsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vqsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vqsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index c85227c675..47ce6ebb83 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -174,6 +174,9 @@ VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar +VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar + # Predicate operations %mask_22_13 22:1 13:3 VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 91139d2c0d..20c7145a68 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -518,6 +518,24 @@ static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) #define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) #define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) +/* + * For QDMULH and QRDMULH we simplify "double and shift by esize" into + * "shift by esize-1", adjusting the QRDMULH rounding constant to match. 
+ */ +#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ + INT32_MIN, INT32_MAX, s) + +#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ + INT32_MIN, INT32_MAX, s) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ @@ -585,6 +603,13 @@ DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) +DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) + static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) { m &= 0xff; diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 27c69d9c7d..84a7320cf8 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -450,6 +450,8 @@ DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) +DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) +DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) DO_2OP_SCALAR(VBRSR, vbrsr) static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, From a88903537d73b1d9728e3d824920b4d0096f10bc Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:13 +0100 Subject: [PATCH 40/57] target/arm: Implement MVE VQDMULL scalar Implement the MVE VQDMULL scalar insn. This multiplies the top or bottom half of each element by the scalar, doubles and saturates to a double-width result. Note that this encoding overlaps with VQADD and VQSUB; it uses what in VQADD and VQSUB would be the 'size=0b11' encoding. 
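The corner case is the doubling step: the widening multiply itself cannot
overflow, but doubling the product of the two most negative inputs can. A
standalone sketch of the 16x16->32 form (an illustration with invented
names, not the patch's helper; the 32x32->64 form instead has to compare
against INT64_MAX / 2 before doubling, as the code below does):

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

/* Doubling saturating widening multiply: 16-bit inputs, 32-bit result */
static int32_t sketch_qdmull_h(int16_t n, int16_t m, bool *sat)
{
    int64_t r = ((int64_t)n * m) * 2;
    if (r > INT32_MAX) { *sat = true; return INT32_MAX; }
    if (r < INT32_MIN) { *sat = true; return INT32_MIN; }
    return (int32_t)r;
}

int main(void)
{
    bool sat = false;
    assert(sketch_qdmull_h(100, 200, &sat) == 40000 && !sat);
    /* INT16_MIN * INT16_MIN is 2^30; doubled it is 2^31, past INT32_MAX */
    assert(sketch_qdmull_h(INT16_MIN, INT16_MIN, &sat) == INT32_MAX && sat);
    return 0;
}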
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-30-peter.maydell@linaro.org --- target/arm/helper-mve.h | 5 +++ target/arm/mve.decode | 23 +++++++++++--- target/arm/mve_helper.c | 65 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 30 ++++++++++++++++++ 4 files changed, 119 insertions(+), 4 deletions(-) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index a0a01d0cc3..41dd61264e 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -201,6 +201,11 @@ DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 47ce6ebb83..a71ad7252b 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -23,6 +23,9 @@ %qm 5:1 1:3 %qn 7:1 17:3 +# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit +%size_28 28:1 !function=plus_1 + &vldr_vstr rn qd imm p a w size l u &1op qd qm size &2op qd qm qn size @@ -38,6 +41,7 @@ @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 @2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn +@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn # Vector loads and stores @@ -168,15 +172,26 @@ VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar -VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar -VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar -VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar -VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar +{ + VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar + # Predicate operations %mask_22_13 22:1 13:3 VPST 1111 1110 0 . 11 000 1 ... 
0 1111 0100 1101 mask=%mask_22_13 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 20c7145a68..d58cca5e72 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -610,6 +610,71 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) +/* + * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the + * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. + * SATMASK specifies which bits of the predicate mask matter for determining + * whether to propagate a saturation indication into FPSCR.QC -- for + * the 16x16->32 case we must check only the bit corresponding to the T or B + * half that we used, but for the 32x32->64 case we propagate if the mask + * bit is set for either half. + */ +#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) +{ + int64_t r = ((int64_t)n * m) * 2; + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); +} + +static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) +{ + /* The multiply can't overflow, but the doubling might */ + int64_t r = (int64_t)n * m; + if (r > INT64_MAX / 2) { + *sat = true; + return INT64_MAX; + } else if (r < INT64_MIN / 2) { + *sat = true; + return INT64_MIN; + } else { + return r * 2; + } +} + +#define SATMASK16B 1 +#define SATMASK16T (1 << 2) +#define SATMASK32 ((1 << 4) | 1) + +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16B) +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16T) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) + static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) { m &= 0xff; diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 84a7320cf8..f73b36cae4 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -454,6 +454,36 @@ DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) DO_2OP_SCALAR(VBRSR, vbrsr) +static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullb_scalarh, + gen_helper_mve_vqdmullb_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullt_scalarh, + gen_helper_mve_vqdmullt_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, 
fns[a->size]); +} + static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, MVEGenDualAccOpFn *fn) { From 380caf6c0762f43a9468aeebaf4ba7e1dd8edc9a Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:14 +0100 Subject: [PATCH 41/57] target/arm: Implement MVE VQDMULH, VQRDMULH (vector) Implement the vector forms of the MVE VQDMULH and VQRDMULH insns. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-31-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 27 +++++++++++++++++++++++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 40 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 41dd61264e..c4e766c651 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -145,6 +145,14 @@ DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index a71ad7252b..9860d43f73 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -113,6 +113,9 @@ VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op +VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op +VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 
0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index d58cca5e72..1fbd066499 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -370,6 +370,25 @@ DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) mve_advance_vpt(env); \ } +#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + #define DO_AND(N, M) ((N) & (M)) #define DO_BIC(N, M) ((N) & ~(M)) #define DO_ORR(N, M) ((N) | (M)) @@ -536,6 +555,14 @@ static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) #define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ INT32_MIN, INT32_MAX, s) +DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) + +DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index f73b36cae4..ec9a985286 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -396,6 +396,8 @@ DO_2OP(VMULL_BS, vmullbs) DO_2OP(VMULL_BU, vmullbu) DO_2OP(VMULL_TS, vmullts) DO_2OP(VMULL_TU, vmulltu) +DO_2OP(VQDMULH, vqdmulh) +DO_2OP(VQRDMULH, vqrdmulh) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From f741707bb36f7281ceccbdc0c44dcce61fbe1023 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:15 +0100 Subject: [PATCH 42/57] target/arm: Implement MVE VQADD, VQSUB (vector) Implement the vector forms of the MVE VQADD and VQSUB insns. 
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-32-peter.maydell@linaro.org --- target/arm/helper-mve.h | 16 ++++++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 14 ++++++++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 39 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index c4e766c651..93847fc04a 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -153,6 +153,22 @@ DEF_HELPER_FLAGS_4(mve_vqrdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 9860d43f73..80fa647c08 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -116,6 +116,11 @@ VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op +VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 
0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 1fbd066499..4c97ae88f9 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -563,6 +563,20 @@ DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) +DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index ec9a985286..9f59ed591b 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -398,6 +398,10 @@ DO_2OP(VMULL_TS, vmullts) DO_2OP(VMULL_TU, vmulltu) DO_2OP(VQDMULH, vqdmulh) DO_2OP(VQRDMULH, vqrdmulh) +DO_2OP(VQADD_S, vqadds) +DO_2OP(VQADD_U, vqaddu) +DO_2OP(VQSUB_S, vqsubs) +DO_2OP(VQSUB_U, vqsubu) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From 483da6613937ea34fbf4b970668021dd76e46636 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:16 +0100 Subject: [PATCH 43/57] target/arm: Implement MVE VQSHL (vector) Implement the MVE VQSHL insn (encoding T4, which is the vector-shift-by-vector version). The DO_SQSHL_OP and DO_UQSHL_OP macros here are derived from the neon_helper.c code for qshl_u{8,16,32} and qshl_s{8,16,32}. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-33-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 12 ++++++++++++ target/arm/mve_helper.c | 34 ++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 56 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 93847fc04a..1c5626bb72 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -169,6 +169,14 @@ DEF_HELPER_FLAGS_4(mve_vqsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 80fa647c08..2c37e26576 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -40,6 +40,15 @@ @2op .... .... .. size:2 .... .... .... .... .... 
&2op qd=%qd qm=%qm qn=%qn @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn. +# This gives us consistency with A64 and Neon. +@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm + @2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn @2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn # Vector loads and stores @@ -121,6 +130,9 @@ VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev +VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 4c97ae88f9..d9f5fe13b1 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -389,6 +389,18 @@ DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) mve_advance_vpt(env); \ } +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_SAT_U(OP, FN) \ + DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ + DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ + DO_2OP_SAT(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_SAT_S(OP, FN) \ + DO_2OP_SAT(OP##b, 1, int8_t, FN) \ + DO_2OP_SAT(OP##h, 2, int16_t, FN) \ + DO_2OP_SAT(OP##w, 4, int32_t, FN) + #define DO_AND(N, M) ((N) & (M)) #define DO_BIC(N, M) ((N) & ~(M)) #define DO_ORR(N, M) ((N) | (M)) @@ -577,6 +589,28 @@ DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) +/* + * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() + * and friends wanting a uint32_t* sat and our needing a bool*. + */ +#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ + ({ \ + uint32_t su32 = 0; \ + typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ + if (su32) { \ + *satp = true; \ + } \ + r; \ + }) + +#define DO_SQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) +#define DO_UQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) + +DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) +DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 9f59ed591b..52fef6cd89 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -402,6 +402,8 @@ DO_2OP(VQADD_S, vqadds) DO_2OP(VQADD_U, vqaddu) DO_2OP(VQSUB_S, vqsubs) DO_2OP(VQSUB_U, vqsubu) +DO_2OP(VQSHL_S, vqshls) +DO_2OP(VQSHL_U, vqshlu) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From 9dc868c41d8c630f3c13040e2732b4df6d4739de Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:17 +0100 Subject: [PATCH 44/57] target/arm: Implement MVE VQRSHL Implement the MVE VQRSHL (vector) insn. Again, the code to perform the actual shifts is borrowed from neon_helper.c. 
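For reference, the semantics being wrapped: the shift count is taken as a
signed byte, a negative count shifts right with a rounding constant of
1 << (count - 1) added first, and a left shift that would lose bits
saturates. A simplified standalone sketch for one unsigned 32-bit lane (an
illustration with an invented name; it deliberately leaves out the
out-of-range shift counts that do_uqrshl_bhs must also handle):

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

static uint32_t sketch_uqrshl32(uint32_t n, int8_t shift, bool *sat)
{
    assert(shift > -32 && shift < 32);        /* keep the sketch simple */
    if (shift >= 0) {
        uint64_t r = (uint64_t)n << shift;
        if (r > UINT32_MAX) {                 /* bits lost: saturate */
            *sat = true;
            return UINT32_MAX;
        }
        return (uint32_t)r;
    }
    unsigned rshift = -shift;
    uint64_t r = (uint64_t)n + ((uint64_t)1 << (rshift - 1)); /* rounding */
    return (uint32_t)(r >> rshift);
}

int main(void)
{
    bool sat = false;
    assert(sketch_uqrshl32(3, -1, &sat) == 2 && !sat); /* (3 + 1) >> 1 */
    assert(sketch_uqrshl32(0x80000000u, 1, &sat) == UINT32_MAX && sat);
    return 0;
}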
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-34-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 6 ++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 19 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 1c5626bb72..42be99ad52 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -177,6 +177,14 @@ DEF_HELPER_FLAGS_4(mve_vqshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 2c37e26576..e78eab6d65 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -133,6 +133,9 @@ VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev +VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev +VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index d9f5fe13b1..64cb5b0c84 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -607,9 +607,15 @@ DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) #define DO_UQSHL_OP(N, M, satp) \ WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) +#define DO_SQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) +#define DO_UQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) +DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) +DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 52fef6cd89..bd4c6150ca 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -404,6 +404,8 @@ DO_2OP(VQSUB_S, vqsubs) DO_2OP(VQSUB_U, vqsubu) DO_2OP(VQSHL_S, vqshls) DO_2OP(VQSHL_U, vqshlu) +DO_2OP(VQRSHL_S, vqrshls) +DO_2OP(VQRSHL_U, vqrshlu) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From 0372cad813193bab3fb88985129ac59c801ca065 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:18 +0100 Subject: [PATCH 45/57] target/arm: Implement MVE VSHL insn Implement the MVE VSHL insn (vector form). 
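The detail worth noting for the vector form: each lane's shift count is the
low byte of the corresponding element of the other source vector,
interpreted as signed, so negative counts shift right, and counts at or
beyond the element width shift everything out and produce zero. A standalone
sketch for one unsigned 32-bit lane (illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint32_t sketch_vshl_u32(uint32_t n, uint32_t m)
{
    int8_t shift = (int8_t)m;           /* only the bottom byte is used */
    if (shift >= 32 || shift <= -32) {
        return 0;                       /* everything shifted out */
    }
    return shift >= 0 ? n << shift : n >> -shift;
}

int main(void)
{
    printf("0x%x\n", sketch_vshl_u32(0x10, 4));            /* 0x100 */
    printf("0x%x\n", sketch_vshl_u32(0x10, (uint32_t)-4)); /* 0x1 */
    return 0;
}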
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-35-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 6 ++++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 19 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 42be99ad52..56b3e8591a 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -169,6 +169,14 @@ DEF_HELPER_FLAGS_4(mve_vqsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vqshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index e78eab6d65..ebf156b46b 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -130,6 +130,9 @@ VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev +VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev + VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 64cb5b0c84..4a07ddfc56 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -521,6 +521,12 @@ DO_2OP_U(vhaddu, do_vhadd_u) DO_2OP_S(vhsubs, do_vhsub_s) DO_2OP_U(vhsubu, do_vhsub_u) +#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) + +DO_2OP_S(vshls, DO_VSHLS) +DO_2OP_U(vshlu, DO_VSHLU) + static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) { if (val > max) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index bd4c6150ca..487ac3185c 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -402,6 +402,8 @@ DO_2OP(VQADD_S, vqadds) DO_2OP(VQADD_U, vqaddu) DO_2OP(VQSUB_S, vqsubs) DO_2OP(VQSUB_U, vqsubu) +DO_2OP(VSHL_S, vshls) +DO_2OP(VSHL_U, vshlu) DO_2OP(VQSHL_S, vqshls) DO_2OP(VQSHL_U, vqshlu) DO_2OP(VQRSHL_S, vqrshls) From bb002345ebfe09f6f96fc41043f93d2e286cd136 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:19 +0100 Subject: [PATCH 46/57] target/arm: Implement MVE VRSHL Implement the MVE VRSHL insn (vector form). 
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-36-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 3 +++ target/arm/mve_helper.c | 4 ++++ target/arm/translate-mve.c | 2 ++ 4 files changed, 17 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 56b3e8591a..b7e2243a19 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -177,6 +177,14 @@ DEF_HELPER_FLAGS_4(mve_vshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vqshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index ebf156b46b..c30fb2c153 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -133,6 +133,9 @@ VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev +VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev +VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev + VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 4a07ddfc56..770938d76c 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -523,9 +523,13 @@ DO_2OP_U(vhsubu, do_vhsub_u) #define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) #define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) +#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) DO_2OP_S(vshls, DO_VSHLS) DO_2OP_U(vshlu, DO_VSHLU) +DO_2OP_S(vrshls, DO_VRSHLS) +DO_2OP_U(vrshlu, DO_VRSHLU) static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 487ac3185c..d75cc377fe 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -404,6 +404,8 @@ DO_2OP(VQSUB_S, vqsubs) DO_2OP(VQSUB_U, vqsubu) DO_2OP(VSHL_S, vshls) DO_2OP(VSHL_U, vshlu) +DO_2OP(VRSHL_S, vrshls) +DO_2OP(VRSHL_U, vrshlu) DO_2OP(VQSHL_S, vqshls) DO_2OP(VQSHL_U, vqshlu) DO_2OP(VQRSHL_S, vqrshls) From fd677f8055fa88d72f01eb9aeb1dd90606d85444 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:20 +0100 Subject: [PATCH 47/57] target/arm: Implement MVE VQDMLADH and VQRDMLADH Implement the MVE VQDMLADH and VQRDMLADH insns. These multiply elements, and then add pairs of products, double, possibly round, saturate and return the high half of the result. 
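As a concrete model of the arithmetic for one 16-bit result (a
hypothetical standalone function, not the helper added below, which
additionally handles predication and, for the 32-bit case, a more
delicate evaluation order):

    static int16_t qdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
                             bool round, bool *sat)
    {
        /* (a*b + c*d) * 2, optionally rounded, saturated to 32 bits */
        int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2
                    + (round ? 1 << 15 : 0);
        if (r > INT32_MAX) {
            r = INT32_MAX;
            *sat = true;
        } else if (r < INT32_MIN) {
            r = INT32_MIN;
            *sat = true;
        }
        return r >> 16;   /* high half of the 32-bit result */
    }

With all four inputs 0x4000 (0.5 in Q15 fixed point) the doubled sum of
products is exactly 0x40000000, so the returned high half is again 0x4000.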
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-37-peter.maydell@linaro.org --- target/arm/helper-mve.h | 16 +++++++ target/arm/mve.decode | 5 +++ target/arm/mve_helper.c | 89 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 4 ++ 4 files changed, 114 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index b7e2243a19..c3cc6a0847 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -201,6 +201,22 @@ DEF_HELPER_FLAGS_4(mve_vqrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index c30fb2c153..d267c8838e 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -142,6 +142,11 @@ VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev +VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +VQDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +VQRDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +VQRDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 770938d76c..2ae46ec481 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -627,6 +627,95 @@ DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) +/* + * Multiply add dual returning high half + * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of + * whether to add the rounding constant, and the pointer to the + * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", + * saturate to twice the input size and return the high half; or + * (A * B - C * D) etc for VQDMLSDH. 
+ */
+#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \
+    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+                                void *vm) \
+    { \
+        TYPE *d = vd, *n = vn, *m = vm; \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        bool qc = false; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+            bool sat = false; \
+            if ((e & 1) == XCHG) { \
+                TYPE r = FN(n[H##ESIZE(e)], \
+                            m[H##ESIZE(e - XCHG)], \
+                            n[H##ESIZE(e + (1 - 2 * XCHG))], \
+                            m[H##ESIZE(e + (1 - XCHG))], \
+                            ROUND, &sat); \
+                mergemask(&d[H##ESIZE(e)], r, mask); \
+                qc |= sat & mask & 1; \
+            } \
+        } \
+        if (qc) { \
+            env->vfp.qc[0] = qc; \
+        } \
+        mve_advance_vpt(env); \
+    }
+
+static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
+                            int round, bool *sat)
+{
+    int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
+    return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
+                             int round, bool *sat)
+{
+    int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
+    return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
+                             int round, bool *sat)
+{
+    int64_t m1 = (int64_t)a * b;
+    int64_t m2 = (int64_t)c * d;
+    int64_t r;
+    /*
+     * Architecturally we should do the entire add, double, round
+     * and then check for saturation. We do three saturating adds,
+     * but we need to be careful about the order. If the first
+     * m1 + m2 saturates then it's impossible for the *2+rc to
+     * bring it back into the non-saturated range. However, if
+     * m1 + m2 is negative then it's possible that doing the
+     * doubling would take the intermediate result below INT64_MIN
+     * and the addition of the rounding constant then brings it
+     * back in range. So we add half the rounding constant before
+     * doubling rather than adding the rounding constant after
+     * the doubling.
+     */
+    if (sadd64_overflow(m1, m2, &r) ||
+        sadd64_overflow(r, (round << 30), &r) ||
+        sadd64_overflow(r, r, &r)) {
+        *sat = true;
+        return r < 0 ?
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) +DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) +DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index d75cc377fe..d830b42d5c 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -410,6 +410,10 @@ DO_2OP(VQSHL_S, vqshls) DO_2OP(VQSHL_U, vqshlu) DO_2OP(VQRSHL_S, vqrshls) DO_2OP(VQRSHL_U, vqrshlu) +DO_2OP(VQDMLADH, vqdmladh) +DO_2OP(VQDMLADHX, vqdmladhx) +DO_2OP(VQRDMLADH, vqrdmladh) +DO_2OP(VQRDMLADHX, vqrdmladhx) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From 92f117326af14d9bffc2ec99e0f112d33c0615ca Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:21 +0100 Subject: [PATCH 48/57] target/arm: Implement MVE VQDMLSDH and VQRDMLSDH Implement the MVE VQDMLSDH and VQRDMLSDH insns, which are like VQDMLADH and VQRDMLADH except that products are subtracted rather than added. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-38-peter.maydell@linaro.org --- target/arm/helper-mve.h | 16 ++++++++++++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 44 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 4 ++++ 4 files changed, 69 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index c3cc6a0847..61f8082e0e 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -217,6 +217,22 @@ DEF_HELPER_FLAGS_4(mve_vqrdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) 
DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index d267c8838e..fa4fb1b203 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -147,6 +147,11 @@ VQDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op VQRDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op VQRDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op +VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +VQDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +VQRDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +VQRDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 2ae46ec481..5703748c4a 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -702,6 +702,36 @@ static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, return r >> 32; } +static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* The same ordering issue as in do_vqdmladh_w applies here too */ + if (ssub64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) @@ -716,6 +746,20 @@ DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) +DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) + +DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) + #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ uint32_t rm) \ diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index d830b42d5c..27b3e378ac 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -414,6 +414,10 @@ DO_2OP(VQDMLADH, vqdmladh) DO_2OP(VQDMLADHX, vqdmladhx) DO_2OP(VQRDMLADH, vqrdmladh) DO_2OP(VQRDMLADHX, vqrdmladhx) +DO_2OP(VQDMLSDH, vqdmlsdh) +DO_2OP(VQDMLSDHX, vqdmlsdhx) +DO_2OP(VQRDMLSDH, vqrdmlsdh) +DO_2OP(VQRDMLSDHX, vqrdmlsdhx) static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) From 43364321f354b8722d5bab730052b625adc3a92c Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:22 +0100 Subject: [PATCH 49/57] target/arm: Implement MVE VQDMULL (vector) Implement the vector form of the MVE VQDMULL insn. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-39-peter.maydell@linaro.org --- target/arm/helper-mve.h | 5 +++++ target/arm/mve.decode | 5 +++++ target/arm/mve_helper.c | 30 ++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 61f8082e0e..34a46ed38e 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -233,6 +233,11 @@ DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmullbh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmullbw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmullth, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulltw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index fa4fb1b203..3a2a7e75a3 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -39,6 +39,8 @@ @1op_nosz .... .... .... .... 
.... .... .... .... &1op qd=%qd qm=%qm size=0 @2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn @2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \ + size=%size_28 # The _rev suffix indicates that Vn and Vm are reversed. This is # the case for shifts. In the Arm ARM these insns are documented @@ -152,6 +154,9 @@ VQDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op VQRDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op VQRDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op +VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28 +VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 5703748c4a..fb8933a9da 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -899,6 +899,36 @@ DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ do_qdmullw, SATMASK32) +/* + * Long saturating ops + */ +#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ + LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ + mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) +DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) +DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) +DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) + static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) { m &= 0xff; diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 27b3e378ac..05789a1981 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -419,6 +419,36 @@ DO_2OP(VQDMLSDHX, vqdmlsdhx) DO_2OP(VQRDMLSDH, vqrdmlsdh) DO_2OP(VQRDMLSDHX, vqrdmlsdhx) +static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullbh, + gen_helper_mve_vqdmullbw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullth, + gen_helper_mve_vqdmulltw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) { From 1eb987a89d944515b05ccd8b913bee7fd0d547ae Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:23 +0100 Subject: [PATCH 50/57] target/arm: Implement MVE VRHADD Implement the MVE VRHADD insn, which performs a rounded halving 
addition.

Signed-off-by: Peter Maydell
Reviewed-by: Richard Henderson
Message-id: 20210617121628.20116-40-peter.maydell@linaro.org
---
 target/arm/helper-mve.h    | 8 ++++++++
 target/arm/mve.decode      | 3 +++
 target/arm/mve_helper.c    | 6 ++++++
 target/arm/translate-mve.c | 2 ++
 4 files changed, 19 insertions(+)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index 34a46ed38e..2f0cf99359 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -238,6 +238,14 @@ DEF_HELPER_FLAGS_4(mve_vqdmullbw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vqdmullth, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vqdmulltw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 
+DEF_HELPER_FLAGS_4(mve_vrhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vrhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vrhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
 DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 3a2a7e75a3..6b969902df 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -157,6 +157,9 @@ VQRDMLSDHX       1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
 VQDMULLB         111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28
 VQDMULLT         111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28
 
+VRHADD_S         111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+VRHADD_U         111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op
+
 # Vector miscellaneous
 
 VCLS             1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index fb8933a9da..8e18a967b7 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -531,6 +531,12 @@ DO_2OP_U(vshlu, DO_VSHLU)
 DO_2OP_S(vrshls, DO_VRSHLS)
 DO_2OP_U(vrshlu, DO_VRSHLU)
 
+#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
+#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
+
+DO_2OP_S(vrhadds, DO_RHADD_S)
+DO_2OP_U(vrhaddu, DO_RHADD_U)
+
 static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
 {
     if (val > max) {
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 05789a1981..febf644079 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -418,6 +418,8 @@ DO_2OP(VQDMLSDH, vqdmlsdh)
 DO_2OP(VQDMLSDHX, vqdmlsdhx)
 DO_2OP(VQRDMLSDH, vqrdmlsdh)
 DO_2OP(VQRDMLSDHX, vqrdmlsdhx)
+DO_2OP(VRHADD_S, vrhadds)
+DO_2OP(VRHADD_U, vrhaddu)
 
 static bool trans_VQDMULLB(DisasContext *s, arg_2op *a)
 {

From 89bc4c4f78c2435fdf8dc10b650cfe73c75f1f2c Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Thu, 17 Jun 2021 13:16:24 +0100
Subject: [PATCH 51/57] target/arm: Implement MVE VADC, VSBC

Implement the MVE VADC and VSBC insns. These perform an
add-with-carry or subtract-with-carry of the 32-bit elements in each
lane of the input vectors, where the carry-out of each add is the
carry-in of the next. The initial carry input is either fixed (0 for
VADCI, 1 for VSBCI) or is taken from FPSCR.C; the carry out at the
end is written back to FPSCR.C.
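To make the lane chaining concrete, here is a toy model of the VADC
data path (illustration only: vadc_model() is a hypothetical name, the
model ignores predication, and it omits the operand inversion the real
helper uses to implement VSBC):

    static uint32_t vadc_model(uint32_t d[4], const uint32_t n[4],
                               const uint32_t m[4], uint32_t carry_in)
    {
        for (int e = 0; e < 4; e++) {
            /* 64-bit add so the carry out is bit 32 of the result */
            uint64_t r = (uint64_t)n[e] + m[e] + carry_in;
            d[e] = (uint32_t)r;
            carry_in = r >> 32;
        }
        return carry_in;   /* the value written back to FPSCR.C */
    }

So if lane 0 computes 0xffffffff + 1 it produces 0 with a carry that
propagates into lane 1, exactly as one 128-bit addition would.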
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-41-peter.maydell@linaro.org --- target/arm/helper-mve.h | 5 ++++ target/arm/mve.decode | 5 ++++ target/arm/mve_helper.c | 52 ++++++++++++++++++++++++++++++++++++++ target/arm/translate-mve.c | 37 +++++++++++++++++++++++++++ 4 files changed, 99 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 2f0cf99359..459c8eebdd 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -246,6 +246,11 @@ DEF_HELPER_FLAGS_4(mve_vrhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vrhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vrhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vadc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vadci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsbc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsbci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 6b969902df..79915f45d7 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -160,6 +160,11 @@ VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op +VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz +VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz +VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz +VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + # Vector miscellaneous VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 8e18a967b7..8f55f70659 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -537,6 +537,58 @@ DO_2OP_U(vrshlu, DO_VRSHLU) DO_2OP_S(vrhadds, DO_RHADD_S) DO_2OP_U(vrhaddu, DO_RHADD_U) +static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, + uint32_t inv, uint32_t carry_in, bool update_flags) +{ + uint16_t mask = mve_element_mask(env); + unsigned e; + + /* If any additions trigger, we will update flags. */ + if (mask & 0x1111) { + update_flags = true; + } + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + uint64_t r = carry_in; + r += n[H4(e)]; + r += m[H4(e)] ^ inv; + if (mask & 1) { + carry_in = r >> 32; + } + mergemask(&d[H4(e)], r, mask); + } + + if (update_flags) { + /* Store C, clear NZV. 
*/ + env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; + env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; + } + mve_advance_vpt(env); +} + +void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, 0, carry_in, false); +} + +void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, -1, carry_in, false); +} + + +void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, 0, 0, true); +} + +void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, -1, 1, true); +} + static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) { if (val > max) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index febf644079..f8cc6080c9 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -451,6 +451,43 @@ static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) return do_2op(s, a, fns[a->size]); } +/* + * VADC and VSBC: these perform an add-with-carry or subtract-with-carry + * of the 32-bit elements in each lane of the input vectors, where the + * carry-out of each add is the carry-in of the next. The initial carry + * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C + * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C. + * These insns are subject to beat-wise execution. Partial execution + * of an I=1 (initial carry input fixed) insn which does not + * execute the first beat must start with the current FPSCR.NZCV + * value, not the fixed constant input. + */ +static bool trans_VADC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vadc); +} + +static bool trans_VADCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VADC(s, a); + } + return do_2op(s, a, gen_helper_mve_vadci); +} + +static bool trans_VSBC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vsbc); +} + +static bool trans_VSBCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VSBC(s, a); + } + return do_2op(s, a, gen_helper_mve_vsbci); +} + static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, MVEGenTwoOpScalarFn fn) { From 67ec113b119360092dee679ca0f5eca8ac60992c Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:25 +0100 Subject: [PATCH 52/57] target/arm: Implement MVE VCADD Implement the MVE VCADD insn, which performs a complex add with rotate. Note that the size=0b11 encoding is VSBC. The architecture grants some leeway for the "destination and Vm source overlap" case for the size MO_32 case, but we choose not to make use of it, instead always calculating all 16 bytes worth of results before setting the destination register. 
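Viewing adjacent lane pairs as complex numbers laid out as {real,
imaginary}, the rotate-by-90 form computes n + i*m: each even lane
subtracts its partner's imaginary element and each odd lane adds its
partner's real element; rotate-by-270 swaps the subtract and the add.
A sketch for 32-bit lanes (vcadd90_model() is a hypothetical standalone
function which ignores predication):

    static void vcadd90_model(int32_t d[4], const int32_t n[4],
                              const int32_t m[4])
    {
        for (int e = 0; e < 4; e += 2) {
            int32_t re = n[e] - m[e + 1];   /* real -= partner's imaginary */
            int32_t im = n[e + 1] + m[e];   /* imaginary += partner's real */
            d[e] = re;
            d[e + 1] = im;
        }
    }

The implementation below goes further and buffers all 16 bytes of
results before writing the destination, which is what makes the
overlapping Qd == Qm case behave consistently.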
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-42-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 9 +++++++-- target/arm/mve_helper.c | 29 +++++++++++++++++++++++++++++ target/arm/translate-mve.c | 7 +++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 459c8eebdd..b8ad3df9cc 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -251,6 +251,14 @@ DEF_HELPER_FLAGS_4(mve_vadci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vsbc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vsbci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 79915f45d7..afe6007864 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -161,9 +161,14 @@ VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz -VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz -VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + +{ + VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 
0 @2op +} # Vector miscellaneous diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 8f55f70659..620b659fec 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -589,6 +589,35 @@ void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) do_vadc(env, vd, vn, vm, -1, 1, true); } +#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE r[16 / ESIZE]; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0; e < 16 / ESIZE; e++) { \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCADD_ALL(OP, FN0, FN1) \ + DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ + DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ + DO_VCADD(OP##w, 4, int32_t, FN0, FN1) + +DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) +DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) + static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) { if (val > max) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index f8cc6080c9..b164907426 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -420,6 +420,13 @@ DO_2OP(VQRDMLSDH, vqrdmlsdh) DO_2OP(VQRDMLSDHX, vqrdmlsdhx) DO_2OP(VRHADD_S, vrhadds) DO_2OP(VRHADD_U, vrhaddu) +/* + * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose + * so we can reuse the DO_2OP macro. (Our implementation calculates the + * "expected" results in this case.) + */ +DO_2OP(VCADD90, vcadd90) +DO_2OP(VCADD270, vcadd270) static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) { From 8625693ac48f54e87f663736c0bbde7ea450f1f7 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:26 +0100 Subject: [PATCH 53/57] target/arm: Implement MVE VHCADD Implement the MVE VHCADD insn, which is similar to VCADD but performs a halving step. This one overlaps with VADC. 
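The halving step is a signed half-add or half-subtract of each pair,
i.e. the sum or difference shifted right by one, with the intermediate
value widened so it cannot overflow; a sketch (hypothetical names, same
caveats as the VCADD model earlier):

    static int32_t hadd_s32(int32_t a, int32_t b)
    {
        return ((int64_t)a + b) >> 1;
    }

    static int32_t hsub_s32(int32_t a, int32_t b)
    {
        return ((int64_t)a - b) >> 1;
    }

The existing do_vhadd_s/do_vhsub_s helpers already provide exactly
this, so the implementation only needs two more DO_VCADD_ALL()
instantiations.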
Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-43-peter.maydell@linaro.org --- target/arm/helper-mve.h | 8 ++++++++ target/arm/mve.decode | 8 ++++++-- target/arm/mve_helper.c | 2 ++ target/arm/translate-mve.c | 4 +++- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index b8ad3df9cc..161308b67e 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -259,6 +259,14 @@ DEF_HELPER_FLAGS_4(mve_vcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) DEF_HELPER_FLAGS_4(mve_vcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index afe6007864..695097dcca 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -160,8 +160,12 @@ VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op -VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz -VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz +{ + VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} { VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c index 620b659fec..e3cec487e0 100644 --- a/target/arm/mve_helper.c +++ b/target/arm/mve_helper.c @@ -617,6 +617,8 @@ void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) +DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) +DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index b164907426..73c15f4133 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -423,10 +423,12 @@ DO_2OP(VRHADD_U, vrhaddu) /* * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose * so we can reuse the DO_2OP macro. (Our implementation calculates the - * "expected" results in this case.) + * "expected" results in this case.) Similarly for VHCADD. 
*/ DO_2OP(VCADD90, vcadd90) DO_2OP(VCADD270, vcadd270) +DO_2OP(VHCADD90, vhcadd90) +DO_2OP(VHCADD270, vhcadd270) static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) { From 6f060a636bf46869e43a28a0f426ddaea16314f9 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 17 Jun 2021 13:16:27 +0100 Subject: [PATCH 54/57] target/arm: Implement MVE VADDV Implement the MVE VADDV insn, which performs an addition across vector lanes. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-44-peter.maydell@linaro.org --- target/arm/helper-mve.h | 7 +++++++ target/arm/mve.decode | 2 ++ target/arm/mve_helper.c | 24 +++++++++++++++++++++ target/arm/translate-mve.c | 43 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h index 161308b67e..4bbb9b3ae2 100644 --- a/target/arm/helper-mve.h +++ b/target/arm/helper-mve.h @@ -348,3 +348,10 @@ DEF_HELPER_FLAGS_4(mve_vrmlaldavhuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vrmlsldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) DEF_HELPER_FLAGS_4(mve_vrmlsldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_3(mve_vaddvsb, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvub, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvsh, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32) diff --git a/target/arm/mve.decode b/target/arm/mve.decode index 695097dcca..d9ece7be5d 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -252,6 +252,8 @@ VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar +# Vector add across vector +VADDV 111 u:1 1110 1111 size:2 01 ... 
0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
 
 # Predicate operations
 %mask_22_13 22:1 13:3
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index e3cec487e0..05552ce7ee 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -1134,3 +1134,27 @@ DO_LDAVH(vrmlaldavhuw, 4, uint32_t, false, int128_add, int128_add, int128_make64
 
 DO_LDAVH(vrmlsldavhsw, 4, int32_t, false, int128_add, int128_sub, int128_makes64)
 DO_LDAVH(vrmlsldavhxsw, 4, int32_t, true, int128_add, int128_sub, int128_makes64)
+
+/* Vector add across vector */
+#define DO_VADDV(OP, ESIZE, TYPE) \
+    uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+                                    uint32_t ra) \
+    { \
+        uint16_t mask = mve_element_mask(env); \
+        unsigned e; \
+        TYPE *m = vm; \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+            if (mask & 1) { \
+                ra += m[H##ESIZE(e)]; \
+            } \
+        } \
+        mve_advance_vpt(env); \
+        return ra; \
+    } \
+
+DO_VADDV(vaddvsb, 1, int8_t)
+DO_VADDV(vaddvsh, 2, int16_t)
+DO_VADDV(vaddvsw, 4, int32_t)
+DO_VADDV(vaddvub, 1, uint8_t)
+DO_VADDV(vaddvuh, 2, uint16_t)
+DO_VADDV(vaddvuw, 4, uint32_t)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 73c15f4133..04d84e8846 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -33,6 +33,7 @@ typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr);
 typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64);
+typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32);
 
 /* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */
 static inline long mve_qreg_offset(unsigned reg)
@@ -743,3 +744,45 @@ static bool trans_VPST(DisasContext *s, arg_VPST *a)
     mve_update_and_store_eci(s);
     return true;
 }
+
+static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
+{
+    /* VADDV: vector add across vector */
+    static MVEGenVADDVFn * const fns[4][2] = {
+        { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub },
+        { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh },
+        { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw },
+        { NULL, NULL }
+    };
+    TCGv_ptr qm;
+    TCGv_i32 rda;
+
+    if (!dc_isar_feature(aa32_mve, s) ||
+        a->size == 3) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This insn is subject to beat-wise execution. Partial execution
+     * of an A=0 (no-accumulate) insn which does not execute the first
+     * beat must start with the current value of Rda, not zero.
+     */
+    if (a->a || mve_skip_first_beat(s)) {
+        /* Accumulate input from Rda */
+        rda = load_reg(s, a->rda);
+    } else {
+        /* Accumulate starting at zero */
+        rda = tcg_const_i32(0);
+    }
+
+    qm = mve_qreg_ptr(a->qm);
+    fns[a->size][a->u](rda, cpu_env, qm, rda);
+    store_reg(s, a->rda, rda);
+    tcg_temp_free_ptr(qm);
+
+    mve_update_eci(s);
+    return true;
+}

From 4f57ef959cf83cc780658c7e97ba5f737aa666f2 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Thu, 17 Jun 2021 13:16:28 +0100
Subject: [PATCH 55/57] target/arm: Make VMOV scalar <-> gpreg beatwise for MVE

In a CPU with MVE, the VMOV (vector lane to general-purpose register)
and VMOV (general-purpose register to vector lane) insns are not
predicated, but they are subject to beatwise execution if they are not
in an IT block.
Since our implementation always executes all 4 beats in one tick, this means only that we need to handle PSR.ECI: * we must do the usual check for bad ECI state * we must advance ECI state if the insn succeeds * if ECI says we should not be executing the beat corresponding to the lane of the vector register being accessed then we should skip performing the move Note that if PSR.ECI is non-zero then we cannot be in an IT block. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20210617121628.20116-45-peter.maydell@linaro.org --- target/arm/translate-a32.h | 2 + target/arm/translate-mve.c | 4 +- target/arm/translate-vfp.c | 77 +++++++++++++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 8 deletions(-) diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index 2326405300..6dfcafe179 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -47,6 +47,8 @@ long neon_full_reg_offset(unsigned reg); long neon_element_offset(int reg, int element, MemOp memop); void gen_rev16(TCGv_i32 dest, TCGv_i32 var); void clear_eci_state(DisasContext *s); +bool mve_eci_check(DisasContext *s); +void mve_update_and_store_eci(DisasContext *s); static inline TCGv_i32 load_cpu_offset(int offset) { diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index 04d84e8846..67462bdf27 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -57,7 +57,7 @@ static bool mve_check_qreg_bank(DisasContext *s, int qmask) return qmask < 8; } -static bool mve_eci_check(DisasContext *s) +bool mve_eci_check(DisasContext *s) { /* * This is a beatwise insn: check that ECI is valid (not a @@ -91,7 +91,7 @@ static void mve_update_eci(DisasContext *s) } } -static void mve_update_and_store_eci(DisasContext *s) +void mve_update_and_store_eci(DisasContext *s) { /* * For insns which don't call a helper function that will call diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 86e43c02dc..b2991e21ec 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -581,6 +581,48 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a) return true; } +static bool mve_skip_vmov(DisasContext *s, int vn, int index, int size) +{ + /* + * In a CPU with MVE, the VMOV (vector lane to general-purpose register) + * and VMOV (general-purpose register to vector lane) insns are not + * predicated, but they are subject to beatwise execution if they are + * not in an IT block. + * + * Since our implementation always executes all 4 beats in one tick, + * this means only that if PSR.ECI says we should not be executing + * the beat corresponding to the lane of the vector register being + * accessed then we should skip performing the move, and that we need + * to do the usual check for bad ECI state and advance of ECI state. + * + * Note that if PSR.ECI is non-zero then we cannot be in an IT block. + * + * Return true if this VMOV scalar <-> gpreg should be skipped because + * the MVE PSR.ECI state says we skip the beat where the store happens. 
+ */ + + /* Calculate the byte offset into Qn which we're going to access */ + int ofs = (index << size) + ((vn & 1) * 8); + + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + return ofs < 4; + case ECI_A0A1: + return ofs < 8; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return ofs < 12; + default: + g_assert_not_reached(); + } +} + static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) { /* VMOV scalar to general purpose register */ @@ -603,14 +645,26 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) return false; } + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + if (!vfp_access_check(s)) { return true; } - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vn, a->index, a->size | (a->u ? 0 : MO_SIGN)); - store_reg(s, a->rt, tmp); + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, a->index, + a->size | (a->u ? 0 : MO_SIGN)); + store_reg(s, a->rt, tmp); + } + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } return true; } @@ -636,14 +690,25 @@ static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a) return false; } + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + if (!vfp_access_check(s)) { return true; } - tmp = load_reg(s, a->rt); - write_neon_element32(tmp, a->vn, a->index, a->size); - tcg_temp_free_i32(tmp); + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = load_reg(s, a->rt); + write_neon_element32(tmp, a->vn, a->index, a->size); + tcg_temp_free_i32(tmp); + } + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } return true; } From 86f0d4c7290eb2b21ec3eb44956ec245441275db Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 16 Jun 2021 12:56:14 -0700 Subject: [PATCH 56/57] target/arm: Implement MTE3 MTE3 introduces an asymmetric tag checking mode, in which loads are checked synchronously and stores are checked asynchronously. Add support for it. Signed-off-by: Peter Collingbourne Reviewed-by: Richard Henderson Message-id: 20210616195614.11785-1-pcc@google.com [PMM: Add line to emulation.rst] Signed-off-by: Peter Maydell --- docs/system/arm/emulation.rst | 1 + target/arm/cpu64.c | 2 +- target/arm/mte_helper.c | 82 ++++++++++++++++++++++------------- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst index 836c1ca845..144dc491d9 100644 --- a/docs/system/arm/emulation.rst +++ b/docs/system/arm/emulation.rst @@ -29,6 +29,7 @@ the following architecture extensions: - FEAT_LSE (Large System Extensions) - FEAT_MTE (Memory Tagging Extension) - FEAT_MTE2 (Memory Tagging Extension) +- FEAT_MTE3 (MTE Asymmetric Fault Handling) - FEAT_PAN (Privileged access never) - FEAT_PAN2 (AT S1E1R and AT S1E1W instruction variants affected by PSTATE.PAN) - FEAT_PAuth (Pointer authentication) diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 1c23187d1a..c7a1626bec 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -683,7 +683,7 @@ static void aarch64_max_initfn(Object *obj) * during realize if the board provides no tag memory, much like * we do for EL2 with the virtualization=on property. 
*/ - t = FIELD_DP64(t, ID_AA64PFR1, MTE, 2); + t = FIELD_DP64(t, ID_AA64PFR1, MTE, 3); cpu->isar.id_aa64pfr1 = t; t = cpu->isar.id_aa64mmfr0; diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c index 9e615cc513..724175210b 100644 --- a/target/arm/mte_helper.c +++ b/target/arm/mte_helper.c @@ -538,13 +538,50 @@ void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) } } +static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int is_write, syn; + + env->exception.vaddress = dirty_ptr; + + is_write = FIELD_EX32(desc, MTEDESC, WRITE); + syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, + 0x11); + raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); + g_assert_not_reached(); +} + +static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, + uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) +{ + int select; + + if (regime_has_2_ranges(arm_mmu_idx)) { + select = extract64(dirty_ptr, 55, 1); + } else { + select = 0; + } + env->cp15.tfsr_el[el] |= 1 << select; +#ifdef CONFIG_USER_ONLY + /* + * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, + * which then sends a SIGSEGV when the thread is next scheduled. + * This cpu will return to the main loop at the end of the TB, + * which is rather sooner than "normal". But the alternative + * is waiting until the next syscall. + */ + qemu_cpu_kick(env_cpu(env)); +#endif +} + /* Record a tag check failure. */ static void mte_check_fail(CPUARMState *env, uint32_t desc, uint64_t dirty_ptr, uintptr_t ra) { int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); - int el, reg_el, tcf, select, is_write, syn; + int el, reg_el, tcf; uint64_t sctlr; reg_el = regime_el(env, arm_mmu_idx); @@ -564,14 +601,8 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc, switch (tcf) { case 1: /* Tag check fail causes a synchronous exception. */ - env->exception.vaddress = dirty_ptr; - - is_write = FIELD_EX32(desc, MTEDESC, WRITE); - syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, - is_write, 0x11); - raise_exception_ra(env, EXCP_DATA_ABORT, syn, - exception_target_el(env), ra); - /* noreturn, but fall through to the assert anyway */ + mte_sync_check_fail(env, desc, dirty_ptr, ra); + break; case 0: /* @@ -583,30 +614,19 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc, case 2: /* Tag check fail causes asynchronous flag set. */ - if (regime_has_2_ranges(arm_mmu_idx)) { - select = extract64(dirty_ptr, 55, 1); - } else { - select = 0; - } - env->cp15.tfsr_el[el] |= 1 << select; -#ifdef CONFIG_USER_ONLY - /* - * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, - * which then sends a SIGSEGV when the thread is next scheduled. - * This cpu will return to the main loop at the end of the TB, - * which is rather sooner than "normal". But the alternative - * is waiting until the next syscall. - */ - qemu_cpu_kick(env_cpu(env)); -#endif + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); break; - default: - /* Case 3: Reserved. */ - qemu_log_mask(LOG_GUEST_ERROR, - "Tag check failure with SCTLR_EL%d.TCF%s " - "set to reserved value %d\n", - reg_el, el ? "" : "0", tcf); + case 3: + /* + * Tag check fail causes asynchronous flag set for stores, or + * a synchronous exception for loads. 
+ */ + if (FIELD_EX32(desc, MTEDESC, WRITE)) { + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + } else { + mte_sync_check_fail(env, desc, dirty_ptr, ra); + } break; } } From 90a76c6316cfe6416fc33814a838fb3928f746ee Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Mon, 21 Jun 2021 09:56:25 +0200 Subject: [PATCH 57/57] docs/system: arm: Add nRF boards description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the target guide for BBC Micro:bit. Information is taken from https://wiki.qemu.org/Features/MicroBit and from hw/arm/nrf51_soc.c. Signed-off-by: Alexandre Iooss Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Joel Stanley Message-id: 20210621075625.540471-1-erdnaxe@crans.org Signed-off-by: Peter Maydell --- MAINTAINERS | 1 + docs/system/arm/nrf.rst | 51 ++++++++++++++++++++++++++++++++++++++ docs/system/target-arm.rst | 1 + 3 files changed, 53 insertions(+) create mode 100644 docs/system/arm/nrf.rst diff --git a/MAINTAINERS b/MAINTAINERS index 0ca6b7de94..e8ba494c3f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1031,6 +1031,7 @@ F: hw/*/microbit*.c F: include/hw/*/nrf51*.h F: include/hw/*/microbit*.h F: tests/qtest/microbit-test.c +F: docs/system/arm/nrf.rst AVR Machines ------------- diff --git a/docs/system/arm/nrf.rst b/docs/system/arm/nrf.rst new file mode 100644 index 0000000000..eda87bd760 --- /dev/null +++ b/docs/system/arm/nrf.rst @@ -0,0 +1,51 @@ +Nordic nRF boards (``microbit``) +================================ + +The `Nordic nRF`_ chips are a family of ARM-based System-on-Chip that +are designed to be used for low-power and short-range wireless solutions. + +.. _Nordic nRF: https://www.nordicsemi.com/Products + +The nRF51 series is the first series for short range wireless applications. +It is superseded by the nRF52 series. +The following machines are based on this chip : + +- ``microbit`` BBC micro:bit board with nRF51822 SoC + +There are other series such as nRF52, nRF53 and nRF91 which are currently not +supported by QEMU. + +Supported devices +----------------- + + * ARM Cortex-M0 (ARMv6-M) + * Serial ports (UART) + * Clock controller + * Timers + * Random Number Generator (RNG) + * GPIO controller + * NVMC + * SWI + +Missing devices +--------------- + + * Watchdog + * Real-Time Clock (RTC) controller + * TWI (i2c) + * SPI controller + * Analog to Digital Converter (ADC) + * Quadrature decoder + * Radio + +Boot options +------------ + +The Micro:bit machine can be started using the ``-device`` option to load a +firmware in `ihex format`_. Example: + +.. _ihex format: https://en.wikipedia.org/wiki/Intel_HEX + +.. code-block:: bash + + $ qemu-system-arm -M microbit -device loader,file=test.hex diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst index 8b8547f9a9..13b3eeaf07 100644 --- a/docs/system/target-arm.rst +++ b/docs/system/target-arm.rst @@ -87,6 +87,7 @@ undocumented; you can get a complete list by running arm/digic arm/musicpal arm/gumstix + arm/nrf arm/nseries arm/nuvoton arm/orangepi