diff --git a/MAINTAINERS b/MAINTAINERS index 1a041eaf86..3443d2a5b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1032,6 +1032,7 @@ F: hw/*/microbit*.c F: include/hw/*/nrf51*.h F: include/hw/*/microbit*.h F: tests/qtest/microbit-test.c +F: docs/system/arm/nrf.rst AVR Machines ------------- diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst new file mode 100644 index 0000000000..144dc491d9 --- /dev/null +++ b/docs/system/arm/emulation.rst @@ -0,0 +1,103 @@ +A-profile CPU architecture support +================================== + +QEMU's TCG emulation includes support for the Armv5, Armv6, Armv7 and +Armv8 versions of the A-profile architecture. It also has support for +the following architecture extensions: + +- FEAT_AA32BF16 (AArch32 BFloat16 instructions) +- FEAT_AA32HPD (AArch32 hierarchical permission disables) +- FEAT_AA32I8MM (AArch32 Int8 matrix multiplication instructions) +- FEAT_AES (AESD and AESE instructions) +- FEAT_BF16 (AArch64 BFloat16 instructions) +- FEAT_BTI (Branch Target Identification) +- FEAT_DIT (Data Independent Timing instructions) +- FEAT_DPB (DC CVAP instruction) +- FEAT_DotProd (Advanced SIMD dot product instructions) +- FEAT_FCMA (Floating-point complex number instructions) +- FEAT_FHM (Floating-point half-precision multiplication instructions) +- FEAT_FP16 (Half-precision floating-point data processing) +- FEAT_FRINTTS (Floating-point to integer instructions) +- FEAT_FlagM (Flag manipulation instructions v2) +- FEAT_FlagM2 (Enhancements to flag manipulation instructions) +- FEAT_HPDS (Hierarchical permission disables) +- FEAT_I8MM (AArch64 Int8 matrix multiplication instructions) +- FEAT_JSCVT (JavaScript conversion instructions) +- FEAT_LOR (Limited ordering regions) +- FEAT_LRCPC (Load-acquire RCpc instructions) +- FEAT_LRCPC2 (Load-acquire RCpc instructions v2) +- FEAT_LSE (Large System Extensions) +- FEAT_MTE (Memory Tagging Extension) +- FEAT_MTE2 (Memory Tagging Extension) +- FEAT_MTE3 (MTE Asymmetric Fault Handling) +- FEAT_PAN (Privileged access never) +- FEAT_PAN2 (AT S1E1R and AT S1E1W instruction variants affected by PSTATE.PAN) +- FEAT_PAuth (Pointer authentication) +- FEAT_PMULL (PMULL, PMULL2 instructions) +- FEAT_PMUv3p1 (PMU Extensions v3.1) +- FEAT_PMUv3p4 (PMU Extensions v3.4) +- FEAT_RDM (Advanced SIMD rounding double multiply accumulate instructions) +- FEAT_RNG (Random number generator) +- FEAT_SB (Speculation Barrier) +- FEAT_SEL2 (Secure EL2) +- FEAT_SHA1 (SHA1 instructions) +- FEAT_SHA256 (SHA256 instructions) +- FEAT_SHA3 (Advanced SIMD SHA3 instructions) +- FEAT_SHA512 (Advanced SIMD SHA512 instructions) +- FEAT_SM3 (Advanced SIMD SM3 instructions) +- FEAT_SM4 (Advanced SIMD SM4 instructions) +- FEAT_SPECRES (Speculation restriction instructions) +- FEAT_SSBS (Speculative Store Bypass Safe) +- FEAT_TLBIOS (TLB invalidate instructions in Outer Shareable domain) +- FEAT_TLBIRANGE (TLB invalidate range instructions) +- FEAT_TTCNP (Translation table Common not private translations) +- FEAT_TTST (Small translation tables) +- FEAT_UAO (Unprivileged Access Override control) +- FEAT_VHE (Virtualization Host Extensions) +- FEAT_VMID16 (16-bit VMID) +- FEAT_XNX (Translation table stage 2 Unprivileged Execute-never) +- SVE (The Scalable Vector Extension) +- SVE2 (The Scalable Vector Extension v2) + +For information on the specifics of these extensions, please refer +to the `Armv8-A Arm Architecture Reference Manual +`_. 
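+
+As a quick illustration (the ``virt`` board, the ``Image`` kernel filename
+and the ``sve`` property below are only examples, not an exhaustive or
+authoritative recipe), every feature in the list above can be enabled at
+once by choosing the ``max`` CPU described in the next paragraph, and any
+feature which also has a CPU property can be switched off again on the
+command line:
+
+.. code-block:: bash
+
+   $ qemu-system-aarch64 -M virt -cpu max -nographic -kernel Image
+   $ qemu-system-aarch64 -M virt -cpu max,sve=off -nographic -kernel Image
+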
+ +When a specific named CPU is being emulated, only those features which +are present in hardware for that CPU are emulated. (If a feature is +not in the list above then it is not supported, even if the real +hardware should have it.) The ``max`` CPU enables all features. + +R-profile CPU architecture support +================================== + +QEMU's TCG emulation support for R-profile CPUs is currently limited. +We emulate only the Cortex-R5 and Cortex-R5F CPUs. + +M-profile CPU architecture support +================================== + +QEMU's TCG emulation includes support for Armv6-M, Armv7-M, Armv8-M, and +Armv8.1-M versions of the M-profile architucture. It also has support +for the following architecture extensions: + +- FP (Floating-point Extension) +- FPCXT (FPCXT access instructions) +- HP (Half-precision floating-point instructions) +- LOB (Low Overhead loops and Branch future) +- M (Main Extension) +- MPU (Memory Protection Unit Extension) +- PXN (Privileged Execute Never) +- RAS (Reliability, Serviceability and Availability): "minimum RAS Extension" only +- S (Security Extension) +- ST (System Timer Extension) + +For information on the specifics of these extensions, please refer +to the `Armv8-M Arm Architecture Reference Manual +`_. + +When a specific named CPU is being emulated, only those features which +are present in hardware for that CPU are emulated. (If a feature is +not in the list above then it is not supported, even if the real +hardware should have it.) There is no equivalent of the ``max`` CPU for +M-profile. diff --git a/docs/system/arm/nrf.rst b/docs/system/arm/nrf.rst new file mode 100644 index 0000000000..eda87bd760 --- /dev/null +++ b/docs/system/arm/nrf.rst @@ -0,0 +1,51 @@ +Nordic nRF boards (``microbit``) +================================ + +The `Nordic nRF`_ chips are a family of ARM-based System-on-Chip that +are designed to be used for low-power and short-range wireless solutions. + +.. _Nordic nRF: https://www.nordicsemi.com/Products + +The nRF51 series is the first series for short range wireless applications. +It is superseded by the nRF52 series. +The following machines are based on this chip : + +- ``microbit`` BBC micro:bit board with nRF51822 SoC + +There are other series such as nRF52, nRF53 and nRF91 which are currently not +supported by QEMU. + +Supported devices +----------------- + + * ARM Cortex-M0 (ARMv6-M) + * Serial ports (UART) + * Clock controller + * Timers + * Random Number Generator (RNG) + * GPIO controller + * NVMC + * SWI + +Missing devices +--------------- + + * Watchdog + * Real-Time Clock (RTC) controller + * TWI (i2c) + * SPI controller + * Analog to Digital Converter (ADC) + * Quadrature decoder + * Radio + +Boot options +------------ + +The Micro:bit machine can be started using the ``-device`` option to load a +firmware in `ihex format`_. Example: + +.. _ihex format: https://en.wikipedia.org/wiki/Intel_HEX + +.. code-block:: bash + + $ qemu-system-arm -M microbit -device loader,file=test.hex diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst index edd013c7bb..13b3eeaf07 100644 --- a/docs/system/target-arm.rst +++ b/docs/system/target-arm.rst @@ -87,6 +87,7 @@ undocumented; you can get a complete list by running arm/digic arm/musicpal arm/gumstix + arm/nrf arm/nseries arm/nuvoton arm/orangepi @@ -99,6 +100,12 @@ undocumented; you can get a complete list by running arm/virt arm/xlnx-versal-virt +Emulated CPU architecture support +================================= + +.. 
toctree:: + arm/emulation + Arm CPU features ================ diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c new file mode 100644 index 0000000000..c315de1802 --- /dev/null +++ b/hw/acpi/ghes-stub.c @@ -0,0 +1,22 @@ +/* + * Support for generating APEI tables and recording CPER for Guests: + * stub functions. + * + * Copyright (c) 2021 Linaro, Ltd + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/acpi/ghes.h" + +int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) +{ + return -1; +} + +bool acpi_ghes_present(void) +{ + return false; +} diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c index a4dac6bf15..a749b84d62 100644 --- a/hw/acpi/ghes.c +++ b/hw/acpi/ghes.c @@ -386,6 +386,8 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s, /* Create a read-write fw_cfg file for Address */ fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL, NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false); + + ags->present = true; } int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) @@ -443,3 +445,18 @@ int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address) return ret; } + +bool acpi_ghes_present(void) +{ + AcpiGedState *acpi_ged_state; + AcpiGhesState *ags; + + acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED, + NULL)); + + if (!acpi_ged_state) { + return false; + } + ags = &acpi_ged_state->ghes_state; + return ags->present; +} diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index dd69577212..9b7fa75719 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -13,13 +13,13 @@ acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c')) acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c')) acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c')) acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c')) -acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c')) +acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_X86', if_true: files('core.c', 'piix4.c', 'pcihp.c'), if_false: files('acpi-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_X86_ICH', if_true: files('ich9.c', 'tco.c')) acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c')) acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c')) acpi_ss.add(when: 'CONFIG_TPM', if_true: files('tpm.c')) -softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c')) +softmmu_ss.add(when: 'CONFIG_ACPI', if_false: files('acpi-stub.c', 'aml-build-stub.c', 'ghes-stub.c')) softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss) softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c', - 'acpi-x86-stub.c', 'ipmi-stub.c')) + 'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c')) diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h index 2ae8bc1ded..674f6958e9 100644 --- a/include/hw/acpi/ghes.h +++ b/include/hw/acpi/ghes.h @@ -64,6 +64,7 @@ enum { typedef struct AcpiGhesState { uint64_t ghes_addr_le; + bool present; /* True if GHES is present at all on this board */ } AcpiGhesState; void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker); @@ -72,4 +73,12 @@ void acpi_build_hest(GArray *table_data, BIOSLinker *linker, void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState 
*s, GArray *hardware_errors); int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr); + +/** + * acpi_ghes_present: Report whether ACPI GHES table is present + * + * Returns: true if the system has an ACPI GHES table and it is + * safe to call acpi_ghes_record_errors() to record a memory error. + */ +bool acpi_ghes_present(void); #endif diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index ef8a008ea7..1a2ae93758 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -338,6 +338,9 @@ void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_abs_i32(TCGv_i32, TCGv_i32); +/* Replicate a value of size @vece from @in to all the lanes in @out */ +void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in); + static inline void tcg_gen_discard_i32(TCGv_i32 arg) { tcg_gen_op1_i32(INDEX_op_discard, arg); @@ -534,6 +537,9 @@ void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_abs_i64(TCGv_i64, TCGv_i64); +/* Replicate a value of size @vece from @in to all the lanes in @out */ +void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in); + #if TCG_TARGET_REG_BITS == 64 static inline void tcg_gen_discard_i64(TCGv_i64 arg) { @@ -1127,6 +1133,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); #define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i64 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i64 #define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec +#define tcg_gen_dup_tl tcg_gen_dup_i64 #else #define tcg_gen_movi_tl tcg_gen_movi_i32 #define tcg_gen_mov_tl tcg_gen_mov_i32 @@ -1241,6 +1248,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); #define tcg_gen_atomic_smax_fetch_tl tcg_gen_atomic_smax_fetch_i32 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i32 #define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec +#define tcg_gen_dup_tl tcg_gen_dup_i32 #endif #if UINTPTR_MAX == UINT32_MAX diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 41a6c4bfe5..2dad364240 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1264,7 +1264,6 @@ uint64_t dup_const(unsigned vece, uint64_t c); : (qemu_build_not_reached_always(), 0)) \ : dup_const(VECE, C)) - /* * Memory helpers that will be used by TCG generated code. */ diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 1c23187d1a..c7a1626bec 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -683,7 +683,7 @@ static void aarch64_max_initfn(Object *obj) * during realize if the board provides no tag memory, much like * we do for EL2 with the virtualization=on property. */ - t = FIELD_DP64(t, ID_AA64PFR1, MTE, 2); + t = FIELD_DP64(t, ID_AA64PFR1, MTE, 3); cpu->isar.id_aa64pfr1 = t; t = cpu->isar.id_aa64mmfr0; diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h new file mode 100644 index 0000000000..4bbb9b3ae2 --- /dev/null +++ b/target/arm/helper-mve.h @@ -0,0 +1,357 @@ +/* + * M-profile MVE specific helper definitions + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ +DEF_HELPER_FLAGS_3(mve_vldrb, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrw, TCG_CALL_NO_WG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(mve_vldrb_sh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_sw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_uh, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrb_uw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh_sw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vldrh_uw, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(mve_vdup, TCG_CALL_NO_WG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(mve_vclsb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclsw, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vrev16b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev32b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev32h, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64b, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64h, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vrev64w, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vmvn, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vabsb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vabsw, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfabsh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfabss, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_3(mve_vnegb, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vnegw, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfnegh, TCG_CALL_NO_WG, void, env, ptr, ptr) +DEF_HELPER_FLAGS_3(mve_vfnegs, TCG_CALL_NO_WG, void, env, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vand, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vbic, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vorr, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vorn, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_veor, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vaddb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) 
+DEF_HELPER_FLAGS_4(mve_vaddh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vaddw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vsubb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsubh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsubw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrmulhsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrmulhuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmaxsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmaxuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vminsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vminuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vabdsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabdub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vabduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubub, 
TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmullbsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmullbuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vmulltsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vmulltuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmulhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmulhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmulhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqsubsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqsubub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubuh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqsubuw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) 
+DEF_HELPER_FLAGS_4(mve_vqshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrshlsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshlsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshlsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrshlub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshluh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrshluw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmladhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmladhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmladhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqrdmlsdhxw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vqdmullbh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmullbw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmullth, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vqdmulltw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrhaddsb, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrhaddsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrhaddsw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vrhaddub, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrhadduh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vrhadduw, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vadc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vadci, TCG_CALL_NO_WG, void, env, 
ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsbc, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vsbci, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhcadd90b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd90w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vhcadd270b, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) +DEF_HELPER_FLAGS_4(mve_vhcadd270w, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vsub_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vsub_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vsub_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vmul_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vmul_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vmul_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vhsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqadds_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqadds_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqadds_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqaddu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubs_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) 
+ +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqsubu_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqrdmulh_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vbrsrb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vbrsrh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vbrsrw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullb_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(mve_vqdmullt_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vmlaldavuh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlaldavuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vmlsldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vmlsldavxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlaldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vrmlaldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlaldavhuw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_4(mve_vrmlsldavhsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) +DEF_HELPER_FLAGS_4(mve_vrmlsldavhxsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64) + +DEF_HELPER_FLAGS_3(mve_vaddvsb, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvub, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvsh, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvuh, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvsw, TCG_CALL_NO_WG, i32, env, ptr, i32) +DEF_HELPER_FLAGS_3(mve_vaddvuw, TCG_CALL_NO_WG, i32, env, ptr, i32) diff --git a/target/arm/helper.h b/target/arm/helper.h index dc6eb96d43..db87d7d537 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -1019,3 +1019,5 @@ DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG, #include "helper-a64.h" #include "helper-sve.h" #endif + +#include "helper-mve.h" diff --git a/target/arm/internals.h b/target/arm/internals.h index 886db56b58..3ba86e8af8 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1202,4 +1202,15 @@ static inline uint64_t useronly_maybe_clean_ptr(uint32_t desc, uint64_t ptr) return ptr; } +/* Values for M-profile PSR.ECI for MVE insns */ +enum MVEECIState { + 
ECI_NONE = 0, /* No completed beats */ + ECI_A0 = 1, /* Completed: A0 */ + ECI_A0A1 = 2, /* Completed: A0, A1 */ + /* 3 is reserved */ + ECI_A0A1A2 = 4, /* Completed: A0, A1, A2 */ + ECI_A0A1A2B0 = 5, /* Completed: A0, A1, A2, B0 */ + /* All other values reserved */ +}; + #endif diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 37ceadd9a9..59982d470d 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -1410,14 +1410,10 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { ram_addr_t ram_addr; hwaddr paddr; - Object *obj = qdev_get_machine(); - VirtMachineState *vms = VIRT_MACHINE(obj); - bool acpi_enabled = virt_is_acpi_enabled(vms); assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); - if (acpi_enabled && addr && - object_property_get_bool(obj, "ras", NULL)) { + if (acpi_ghes_present() && addr) { ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { diff --git a/target/arm/m-nocp.decode b/target/arm/m-nocp.decode index 6699626d7c..b65c801c97 100644 --- a/target/arm/m-nocp.decode +++ b/target/arm/m-nocp.decode @@ -34,6 +34,14 @@ &nocp cp +# M-profile VLDR/VSTR to sysreg +%vldr_sysreg 22:1 13:3 +%imm7_0x4 0:7 !function=times_4 + +&vldr_sysreg rn reg imm a w p +@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \ + reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg + { # Special cases which do not take an early NOCP: VLLDM and VLSTM VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000 @@ -41,6 +49,22 @@ VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3 VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2 + # FP system register accesses: these are a special case because accesses + # to FPCXT_NS succeed even if the FPU is disabled. We therefore need + # to handle them before the big NOCP blocks. Note that within these + # insns NOCP still has higher priority than UNDEFs; this is implemented + # by their returning 'false' for UNDEF so as to fall through into the + # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding() + # for the UNDEFs there that must take precedence over NOCP.) + + VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 + + # P=0 W=0 is SEE "Related encodings", so split into two patterns + VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 + VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... 
@vldr_sysreg p=0 w=1 + NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp # From v8.1M onwards this range will also NOCP: diff --git a/target/arm/meson.build b/target/arm/meson.build index 2b50be3f86..25a02bf276 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -23,6 +23,7 @@ arm_ss.add(files( 'helper.c', 'iwmmxt_helper.c', 'm_helper.c', + 'mve_helper.c', 'neon_helper.c', 'op_helper.c', 'tlb_helper.c', diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c index 9e615cc513..724175210b 100644 --- a/target/arm/mte_helper.c +++ b/target/arm/mte_helper.c @@ -538,13 +538,50 @@ void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) } } +static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int is_write, syn; + + env->exception.vaddress = dirty_ptr; + + is_write = FIELD_EX32(desc, MTEDESC, WRITE); + syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, + 0x11); + raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); + g_assert_not_reached(); +} + +static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, + uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) +{ + int select; + + if (regime_has_2_ranges(arm_mmu_idx)) { + select = extract64(dirty_ptr, 55, 1); + } else { + select = 0; + } + env->cp15.tfsr_el[el] |= 1 << select; +#ifdef CONFIG_USER_ONLY + /* + * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, + * which then sends a SIGSEGV when the thread is next scheduled. + * This cpu will return to the main loop at the end of the TB, + * which is rather sooner than "normal". But the alternative + * is waiting until the next syscall. + */ + qemu_cpu_kick(env_cpu(env)); +#endif +} + /* Record a tag check failure. */ static void mte_check_fail(CPUARMState *env, uint32_t desc, uint64_t dirty_ptr, uintptr_t ra) { int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); - int el, reg_el, tcf, select, is_write, syn; + int el, reg_el, tcf; uint64_t sctlr; reg_el = regime_el(env, arm_mmu_idx); @@ -564,14 +601,8 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc, switch (tcf) { case 1: /* Tag check fail causes a synchronous exception. */ - env->exception.vaddress = dirty_ptr; - - is_write = FIELD_EX32(desc, MTEDESC, WRITE); - syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, - is_write, 0x11); - raise_exception_ra(env, EXCP_DATA_ABORT, syn, - exception_target_el(env), ra); - /* noreturn, but fall through to the assert anyway */ + mte_sync_check_fail(env, desc, dirty_ptr, ra); + break; case 0: /* @@ -583,30 +614,19 @@ static void mte_check_fail(CPUARMState *env, uint32_t desc, case 2: /* Tag check fail causes asynchronous flag set. */ - if (regime_has_2_ranges(arm_mmu_idx)) { - select = extract64(dirty_ptr, 55, 1); - } else { - select = 0; - } - env->cp15.tfsr_el[el] |= 1 << select; -#ifdef CONFIG_USER_ONLY - /* - * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, - * which then sends a SIGSEGV when the thread is next scheduled. - * This cpu will return to the main loop at the end of the TB, - * which is rather sooner than "normal". But the alternative - * is waiting until the next syscall. - */ - qemu_cpu_kick(env_cpu(env)); -#endif + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); break; - default: - /* Case 3: Reserved. 
*/ - qemu_log_mask(LOG_GUEST_ERROR, - "Tag check failure with SCTLR_EL%d.TCF%s " - "set to reserved value %d\n", - reg_el, el ? "" : "0", tcf); + case 3: + /* + * Tag check fail causes asynchronous flag set for stores, or + * a synchronous exception for loads. + */ + if (FIELD_EX32(desc, MTEDESC, WRITE)) { + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + } else { + mte_sync_check_fail(env, desc, dirty_ptr, ra); + } break; } } diff --git a/target/arm/mve.decode b/target/arm/mve.decode index c8492bb576..d9ece7be5d 100644 --- a/target/arm/mve.decode +++ b/target/arm/mve.decode @@ -18,3 +18,243 @@ # # This file is processed by scripts/decodetree.py # + +%qd 22:1 13:3 +%qm 5:1 1:3 +%qn 7:1 17:3 + +# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit +%size_28 28:1 !function=plus_1 + +&vldr_vstr rn qd imm p a w size l u +&1op qd qm size +&2op qd qm qn size +&2scalar qd qn rm size + +@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 +# Note that both Rn and Qd are 3 bits only (no D bit) +@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr + +@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm +@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 +@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn +@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \ + size=%size_28 + +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn. +# This gives us consistency with A64 and Neon. +@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm + +@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn +@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn + +# Vector loads and stores + +# Widening loads and narrowing stores: +# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding' +# This means we need to expand out to multiple patterns for P, W, SZ. +# For stores the U bit must be 0 but we catch that in the trans_ function. +# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from +# signed halfword element in register", etc. +VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=0 w=1 size=1 +VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=1 size=1 +VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 +VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 + +# Non-widening loads/stores (P=0 W=0 is 'related encoding') +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=0 w=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... 
@vldr_vstr \ + size=0 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=1 + +# Vector 2-op +VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz + +VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op + +VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op + +VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op + +VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op +VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op + +VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op +VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op + +VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op +VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op + +VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op +VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op + +VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op +VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op + +VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op + +VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev +VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev + +VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev +VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev + +VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev +VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev + +VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev +VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev + +VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +VQDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +VQRDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +VQRDMLADHX 1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + +VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +VQDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +VQRDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +VQRDMLSDHX 1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + +VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28 +VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 + +VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op +VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 
0 @2op + +{ + VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +{ + VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +# Vector miscellaneous + +VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op +VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op + +VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op +VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op +VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op + +VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz + +VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op +VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op +VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op +VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op + +&vdup qd rt size +# Qd is in the fields usually named Qn +@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup + +# B and E bits encode size, which we decode here to the usual size values +VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 + +# multiply-add long dual accumulate +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi 20:3 !function=times_2_plus_1 +%rdalo 13:3 !function=times_2 +# size bit is 0 for 16 bit, 1 for 32 bit +%size_16 16:1 !function=plus_1 + +&vmlaldav rdahi rdalo size qn qm x a + +@vmlaldav .... .... . ... ... . ... . .... .... qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav +@vmlaldav_nosz .... .... . ... ... . ... . .... .... qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav +VMLALDAV_S 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav +VMLALDAV_U 1111 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 0 @vmlaldav + +VMLSLDAV 1110 1110 1 ... ... . ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav + +VRMLALDAVH_S 1110 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz +VRMLALDAVH_U 1111 1110 1 ... ... 0 ... x:1 1111 . 0 a:1 0 ... 0 @vmlaldav_nosz + +VRMLSLDAVH 1111 1110 1 ... ... 0 ... x:1 1110 . 0 a:1 0 ... 1 @vmlaldav_nosz + +# Scalar operations + +VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar +VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar +VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar +VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar + +{ + VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... 
@2scalar_nosz \ + size=%size_28 +} + +VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar + +VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar +VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar + +# Vector add across vector +VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo + +# Predicate operations +%mask_22_13 22:1 13:3 +VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c new file mode 100644 index 0000000000..05552ce7ee --- /dev/null +++ b/target/arm/mve_helper.c @@ -0,0 +1,1160 @@ +/* + * M-profile MVE Operations + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "qemu/int128.h" +#include "cpu.h" +#include "internals.h" +#include "vec_internal.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" +#include "tcg/tcg.h" + +static uint16_t mve_element_mask(CPUARMState *env) +{ + /* + * Return the mask of which elements in the MVE vector should be + * updated. This is a combination of multiple things: + * (1) by default, we update every lane in the vector + * (2) VPT predication stores its state in the VPR register; + * (3) low-overhead-branch tail predication will mask out part + * the vector on the final iteration of the loop + * (4) if EPSR.ECI is set then we must execute only some beats + * of the insn + * We combine all these into a 16-bit result with the same semantics + * as VPR.P0: 0 to mask the lane, 1 if it is active. + * 8-bit vector ops will look at all bits of the result; + * 16-bit ops will look at bits 0, 2, 4, ...; + * 32-bit ops will look at bits 0, 4, 8 and 12. + * Compare pseudocode GetCurInstrBeat(), though that only returns + * the 4-bit slice of the mask corresponding to a single beat. + */ + uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); + + if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) { + mask |= 0xff; + } + if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) { + mask |= 0xff00; + } + + if (env->v7m.ltpsize < 4 && + env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) { + /* + * Tail predication active, and this is the last loop iteration. + * The element size is (1 << ltpsize), and we only want to process + * loopcount elements, so we want to retain the least significant + * (loopcount * esize) predicate bits and zero out bits above that. + */ + int masklen = env->regs[14] << env->v7m.ltpsize; + assert(masklen <= 16); + mask &= MAKE_64BIT_MASK(0, masklen); + } + + if ((env->condexec_bits & 0xf) == 0) { + /* + * ECI bits indicate which beats are already executed; + * we handle this by effectively predicating them out. 
+ */ + int eci = env->condexec_bits >> 4; + switch (eci) { + case ECI_NONE: + break; + case ECI_A0: + mask &= 0xfff0; + break; + case ECI_A0A1: + mask &= 0xff00; + break; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + mask &= 0xf000; + break; + default: + g_assert_not_reached(); + } + } + + return mask; +} + +static void mve_advance_vpt(CPUARMState *env) +{ + /* Advance the VPT and ECI state if necessary */ + uint32_t vpr = env->v7m.vpr; + unsigned mask01, mask23; + + if ((env->condexec_bits & 0xf) == 0) { + env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ? + (ECI_A0 << 4) : (ECI_NONE << 4); + } + + if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) { + /* VPT not enabled, nothing to do */ + return; + } + + mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01); + mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23); + if (mask01 > 8) { + /* high bit set, but not 0b1000: invert the relevant half of P0 */ + vpr ^= 0xff; + } + if (mask23 > 8) { + /* high bit set, but not 0b1000: invert the relevant half of P0 */ + vpr ^= 0xff00; + } + vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1); + vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1); + env->v7m.vpr = vpr; +} + + +#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned b, e; \ + /* \ + * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ + * beats so we don't care if we update part of the dest and \ + * then take an exception. \ + */ \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned b, e; \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +DO_VLDR(vldrb, 1, ldub, 1, uint8_t) +DO_VLDR(vldrh, 2, lduw, 2, uint16_t) +DO_VLDR(vldrw, 4, ldl, 4, uint32_t) + +DO_VSTR(vstrb, 1, stb, 1, uint8_t) +DO_VSTR(vstrh, 2, stw, 2, uint16_t) +DO_VSTR(vstrw, 4, stl, 4, uint32_t) + +DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) +DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) +DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) +DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) +DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) +DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) + +DO_VSTR(vstrb_h, 1, stb, 2, int16_t) +DO_VSTR(vstrb_w, 1, stb, 4, int32_t) +DO_VSTR(vstrh_w, 2, stw, 4, int32_t) + +#undef DO_VLDR +#undef DO_VSTR + +/* + * The mergemask(D, R, M) macro performs the operation "*D = R" but + * storing only the bytes which correspond to 1 bits in M, + * leaving other bytes in *D unchanged. We use _Generic + * to select the correct implementation based on the type of D. 
+ */ + +static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) +{ + if (mask & 1) { + *d = r; + } +} + +static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) +{ + mergemask_ub((uint8_t *)d, r, mask); +} + +static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) +{ + uint16_t bmask = expand_pred_b_data[mask & 3]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) +{ + mergemask_uh((uint16_t *)d, r, mask); +} + +static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) +{ + uint32_t bmask = expand_pred_b_data[mask & 0xf]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) +{ + mergemask_uw((uint32_t *)d, r, mask); +} + +static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) +{ + uint64_t bmask = expand_pred_b_data[mask & 0xff]; + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) +{ + mergemask_uq((uint64_t *)d, r, mask); +} + +#define mergemask(D, R, M) \ + _Generic(D, \ + uint8_t *: mergemask_ub, \ + int8_t *: mergemask_sb, \ + uint16_t *: mergemask_uh, \ + int16_t *: mergemask_sh, \ + uint32_t *: mergemask_uw, \ + int32_t *: mergemask_sw, \ + uint64_t *: mergemask_uq, \ + int64_t *: mergemask_sq)(D, R, M) + +void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) +{ + /* + * The generated code already replicated an 8 or 16 bit constant + * into the 32-bit value, so we only need to write the 32-bit + * value to all elements of the Qreg, allowing for predication. + */ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + mergemask(&d[H4(e)], val, mask); + } + mve_advance_vpt(env); +} + +#define DO_1OP(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_1OP(vclsb, 1, int8_t, DO_CLS_B) +DO_1OP(vclsh, 2, int16_t, DO_CLS_H) +DO_1OP(vclsw, 4, int32_t, clrsb32) + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) +DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) +DO_1OP(vclzw, 4, uint32_t, clz32) + +DO_1OP(vrev16b, 2, uint16_t, bswap16) +DO_1OP(vrev32b, 4, uint32_t, bswap32) +DO_1OP(vrev32h, 4, uint32_t, hswap32) +DO_1OP(vrev64b, 8, uint64_t, bswap64) +DO_1OP(vrev64h, 8, uint64_t, hswap64) +DO_1OP(vrev64w, 8, uint64_t, wswap64) + +#define DO_NOT(N) (~(N)) + +DO_1OP(vmvn, 8, uint64_t, DO_NOT) + +#define DO_ABS(N) ((N) < 0 ? 
-(N) : (N)) +#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) +#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) + +DO_1OP(vabsb, 1, int8_t, DO_ABS) +DO_1OP(vabsh, 2, int16_t, DO_ABS) +DO_1OP(vabsw, 4, int32_t, DO_ABS) + +/* We can do these 64 bits at a time */ +DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) +DO_1OP(vfabss, 8, uint64_t, DO_FABSS) + +#define DO_NEG(N) (-(N)) +#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) +#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) + +DO_1OP(vnegb, 1, int8_t, DO_NEG) +DO_1OP(vnegh, 2, int16_t, DO_NEG) +DO_1OP(vnegw, 4, int32_t, DO_NEG) + +/* We can do these 64 bits at a time */ +DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) +DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) + +#define DO_2OP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_U(OP, FN) \ + DO_2OP(OP##b, 1, uint8_t, FN) \ + DO_2OP(OP##h, 2, uint16_t, FN) \ + DO_2OP(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_S(OP, FN) \ + DO_2OP(OP##b, 1, int8_t, FN) \ + DO_2OP(OP##h, 2, int16_t, FN) \ + DO_2OP(OP##w, 4, int32_t, FN) + +/* + * "Long" operations where two half-sized inputs (taken from either the + * top or the bottom of the input vector) produce a double-width result. + * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. + */ +#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ + m[H##ESIZE(le * 2 + TOP)]); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_SAT_U(OP, FN) \ + DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ + DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ + DO_2OP_SAT(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_SAT_S(OP, FN) \ + DO_2OP_SAT(OP##b, 1, int8_t, FN) \ + DO_2OP_SAT(OP##h, 2, int16_t, FN) \ + DO_2OP_SAT(OP##w, 4, int32_t, FN) + +#define DO_AND(N, M) ((N) & (M)) +#define DO_BIC(N, M) ((N) & ~(M)) +#define DO_ORR(N, M) ((N) | (M)) +#define DO_ORN(N, M) ((N) | ~(M)) +#define DO_EOR(N, M) ((N) ^ (M)) + +DO_2OP(vand, 8, uint64_t, DO_AND) +DO_2OP(vbic, 8, uint64_t, DO_BIC) +DO_2OP(vorr, 8, uint64_t, DO_ORR) +DO_2OP(vorn, 8, uint64_t, DO_ORN) +DO_2OP(veor, 8, uint64_t, DO_EOR) + +#define DO_ADD(N, M) ((N) + (M)) +#define DO_SUB(N, M) ((N) - (M)) +#define DO_MUL(N, M) ((N) * 
(M)) + +DO_2OP_U(vadd, DO_ADD) +DO_2OP_U(vsub, DO_SUB) +DO_2OP_U(vmul, DO_MUL) + +DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) + +DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) + +/* + * Because the computation type is at least twice as large as required, + * these work for both signed and unsigned source types. + */ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_w(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint8_t do_rmulh_b(int32_t n, int32_t m) +{ + return (n * m + (1U << 7)) >> 8; +} + +static inline uint16_t do_rmulh_h(int32_t n, int32_t m) +{ + return (n * m + (1U << 15)) >> 16; +} + +static inline uint32_t do_rmulh_w(int64_t n, int64_t m) +{ + return (n * m + (1U << 31)) >> 32; +} + +DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) +DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) +DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) +DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) +DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) +DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) + +DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) +DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) +DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) +DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) +DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) +DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) + +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) + +DO_2OP_S(vmaxs, DO_MAX) +DO_2OP_U(vmaxu, DO_MAX) +DO_2OP_S(vmins, DO_MIN) +DO_2OP_U(vminu, DO_MIN) + +#define DO_ABD(N, M) ((N) >= (M) ? 
(N) - (M) : (M) - (N)) + +DO_2OP_S(vabds, DO_ABD) +DO_2OP_U(vabdu, DO_ABD) + +static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n + m) >> 1; +} + +static inline int32_t do_vhadd_s(int32_t n, int32_t m) +{ + return ((int64_t)n + m) >> 1; +} + +static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n - m) >> 1; +} + +static inline int32_t do_vhsub_s(int32_t n, int32_t m) +{ + return ((int64_t)n - m) >> 1; +} + +DO_2OP_S(vhadds, do_vhadd_s) +DO_2OP_U(vhaddu, do_vhadd_u) +DO_2OP_S(vhsubs, do_vhsub_s) +DO_2OP_U(vhsubu, do_vhsub_u) + +#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) +#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) + +DO_2OP_S(vshls, DO_VSHLS) +DO_2OP_U(vshlu, DO_VSHLU) +DO_2OP_S(vrshls, DO_VRSHLS) +DO_2OP_U(vrshlu, DO_VRSHLU) + +#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1) +#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1) + +DO_2OP_S(vrhadds, DO_RHADD_S) +DO_2OP_U(vrhaddu, DO_RHADD_U) + +static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, + uint32_t inv, uint32_t carry_in, bool update_flags) +{ + uint16_t mask = mve_element_mask(env); + unsigned e; + + /* If any additions trigger, we will update flags. */ + if (mask & 0x1111) { + update_flags = true; + } + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + uint64_t r = carry_in; + r += n[H4(e)]; + r += m[H4(e)] ^ inv; + if (mask & 1) { + carry_in = r >> 32; + } + mergemask(&d[H4(e)], r, mask); + } + + if (update_flags) { + /* Store C, clear NZV. */ + env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; + env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; + } + mve_advance_vpt(env); +} + +void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, 0, carry_in, false); +} + +void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, -1, carry_in, false); +} + + +void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, 0, 0, true); +} + +void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, -1, 1, true); +} + +#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE r[16 / ESIZE]; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0; e < 16 / ESIZE; e++) { \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCADD_ALL(OP, FN0, FN1) \ + DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ + DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ + DO_VCADD(OP##w, 4, int32_t, FN0, FN1) + +DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) +DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) +DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) +DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) + +static inline int32_t do_sat_bhw(int64_t val, int64_t 
min, int64_t max, bool *s) +{ + if (val > max) { + *s = true; + return max; + } else if (val < min) { + *s = true; + return min; + } + return val; +} + +#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) +#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) +#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) + +#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) +#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) +#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) + +#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) +#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) +#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) + +#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) +#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) +#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) + +/* + * For QDMULH and QRDMULH we simplify "double and shift by esize" into + * "shift by esize-1", adjusting the QRDMULH rounding constant to match. + */ +#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ + INT32_MIN, INT32_MAX, s) + +#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ + INT32_MIN, INT32_MAX, s) + +DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) + +DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) + +DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) + +/* + * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() + * and friends wanting a uint32_t* sat and our needing a bool*. 
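+ * (The wrapper is a GCC statement expression so that it can be used
+ * directly as the FN argument of the DO_2OP_SAT macros above.)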
+ */ +#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ + ({ \ + uint32_t su32 = 0; \ + typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ + if (su32) { \ + *satp = true; \ + } \ + r; \ + }) + +#define DO_SQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) +#define DO_UQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) +#define DO_SQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) +#define DO_UQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) + +DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) +DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) +DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) +DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) + +/* + * Multiply add dual returning high half + * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of + * whether to add the rounding constant, and the pointer to the + * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", + * saturate to twice the input size and return the high half; or + * (A * B - C * D) etc for VQDMLSDH. + */ +#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + if ((e & 1) == XCHG) { \ + TYPE r = FN(n[H##ESIZE(e)], \ + m[H##ESIZE(e - XCHG)], \ + n[H##ESIZE(e + (1 - 2 * XCHG))], \ + m[H##ESIZE(e + (1 - XCHG))], \ + ROUND, &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* + * Architecturally we should do the entire add, double, round + * and then check for saturation. We do three saturating adds, + * but we need to be careful about the order. If the first + * m1 + m2 saturates then it's impossible for the *2+rc to + * bring it back into the non-saturated range. However, if + * m1 + m2 is negative then it's possible that doing the doubling + * would take the intermediate result below INT64_MAX and the + * addition of the rounding constant then brings it back in range. + * So we add half the rounding constant before doubling rather + * than adding the rounding constant after the doubling. + */ + if (sadd64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* The same ordering issue as in do_vqdmladh_w applies here too */ + if (ssub64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) +DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) +DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) + +DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) + +#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op scalar helpers for all sizes */ +#define DO_2OP_SCALAR_U(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) +#define 
DO_2OP_SCALAR_S(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, int32_t, FN) + +DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) +DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) +DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) +DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) +DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) +DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) +DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) + +DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) + +DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) + +/* + * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the + * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. + * SATMASK specifies which bits of the predicate mask matter for determining + * whether to propagate a saturation indication into FPSCR.QC -- for + * the 16x16->32 case we must check only the bit corresponding to the T or B + * half that we used, but for the 32x32->64 case we propagate if the mask + * bit is set for either half. 
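+ * Concretely, SATMASK16B tests predicate bit 0, SATMASK16T tests
+ * bit 2, and SATMASK32 tests bits 0 and 4; see the SATMASK
+ * definitions following DO_2OP_SAT_SCALAR_L below.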
+ */ +#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) +{ + int64_t r = ((int64_t)n * m) * 2; + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); +} + +static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) +{ + /* The multiply can't overflow, but the doubling might */ + int64_t r = (int64_t)n * m; + if (r > INT64_MAX / 2) { + *sat = true; + return INT64_MAX; + } else if (r < INT64_MIN / 2) { + *sat = true; + return INT64_MIN; + } else { + return r * 2; + } +} + +#define SATMASK16B 1 +#define SATMASK16T (1 << 2) +#define SATMASK32 ((1 << 4) | 1) + +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16B) +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16T) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) + +/* + * Long saturating ops + */ +#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ + LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ + mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) +DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) +DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) +DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) + +static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit8(n); + if (m < 8) { + n >>= 8 - m; + } + return n; +} + +static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit16(n); + if (m < 16) { + n >>= 16 - m; + } + return n; +} + +static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit32(n); + if (m < 32) { + n >>= 32 - m; + } + return n; +} + +DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) +DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) +DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) + +/* + * Multiply add long dual accumulate ops. 
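+ * EVENACC and ODDACC are the accumulation operators ("+=" or "-=")
+ * applied to even and odd element pairs respectively, so the same
+ * macro generates both the VMLALDAV and VMLSLDAV helpers.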
+ */ +#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) +DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) +DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) +DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) + +DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) +DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) + +DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) +DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) +DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) +DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) + +/* + * Rounding multiply add long dual accumulate high: we must keep + * a 72-bit internal accumulator value and return the top 64 bits. + */ +#define DO_LDAVH(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC, TO128) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + Int128 acc = int128_lshift(TO128(a), 8); \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + acc = ODDACC(acc, TO128(n[H##ESIZE(e - 1 * XCHG)] * \ + m[H##ESIZE(e)])); \ + } else { \ + acc = EVENACC(acc, TO128(n[H##ESIZE(e + 1 * XCHG)] * \ + m[H##ESIZE(e)])); \ + } \ + acc = int128_add(acc, int128_make64(1 << 7)); \ + } \ + } \ + mve_advance_vpt(env); \ + return int128_getlo(int128_rshift(acc, 8)); \ + } + +DO_LDAVH(vrmlaldavhsw, 4, int32_t, false, int128_add, int128_add, int128_makes64) +DO_LDAVH(vrmlaldavhxsw, 4, int32_t, true, int128_add, int128_add, int128_makes64) + +DO_LDAVH(vrmlaldavhuw, 4, uint32_t, false, int128_add, int128_add, int128_make64) + +DO_LDAVH(vrmlsldavhsw, 4, int32_t, false, int128_add, int128_sub, int128_makes64) +DO_LDAVH(vrmlsldavhxsw, 4, int32_t, true, int128_add, int128_sub, int128_makes64) + +/* Vector add across vector */ +#define DO_VADDV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra += m[H##ESIZE(e)]; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +DO_VADDV(vaddvsb, 1, uint8_t) +DO_VADDV(vaddvsh, 2, uint16_t) +DO_VADDV(vaddvsw, 4, uint32_t) +DO_VADDV(vaddvub, 1, uint8_t) +DO_VADDV(vaddvuh, 2, uint16_t) +DO_VADDV(vaddvuw, 4, uint32_t) diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h index 0a0053949f..6dfcafe179 100644 --- a/target/arm/translate-a32.h +++ b/target/arm/translate-a32.h @@ -32,6 +32,7 @@ bool disas_neon_shared(DisasContext *s, uint32_t insn); void load_reg_var(DisasContext *s, TCGv_i32 var, int reg); void arm_gen_condlabel(DisasContext *s); bool vfp_access_check(DisasContext *s); +bool vfp_access_check_m(DisasContext *s, bool skip_context_update); void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop); void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop); void write_neon_element32(TCGv_i32 src, int reg, int ele, 
MemOp memop); @@ -46,6 +47,8 @@ long neon_full_reg_offset(unsigned reg); long neon_element_offset(int reg, int element, MemOp memop); void gen_rev16(TCGv_i32 dest, TCGv_i32 var); void clear_eci_state(DisasContext *s); +bool mve_eci_check(DisasContext *s); +void mve_update_and_store_eci(DisasContext *s); static inline TCGv_i32 load_cpu_offset(int offset) { diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c index 09b3be4ed3..5eab04832c 100644 --- a/target/arm/translate-m-nocp.c +++ b/target/arm/translate-m-nocp.c @@ -19,6 +19,7 @@ #include "qemu/osdep.h" #include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "translate.h" #include "translate-a32.h" @@ -191,6 +192,555 @@ static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) return true; } +/* + * M-profile provides two different sets of instructions that can + * access floating point system registers: VMSR/VMRS (which move + * to/from a general purpose register) and VLDR/VSTR sysreg (which + * move directly to/from memory). In some cases there are also side + * effects which must happen after any write to memory (which could + * cause an exception). So we implement the common logic for the + * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), + * which take pointers to callback functions which will perform the + * actual "read/write general purpose register" and "read/write + * memory" operations. + */ + +/* + * Emit code to store the sysreg to its final destination; frees the + * TCG temp 'value' it is passed. do_access is true to do the store, + * and false to skip it and only perform side-effects like base + * register writeback. + */ +typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access); +/* + * Emit code to load the value to be copied to the sysreg; returns + * a new TCG temporary. do_access is true to do the store, + * and false to skip it and only perform side-effects like base + * register writeback. + */ +typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque, + bool do_access); + +/* Common decode/access checks for fp sysreg read/write */ +typedef enum FPSysRegCheckResult { + FPSysRegCheckFailed, /* caller should return false */ + FPSysRegCheckDone, /* caller should return true */ + FPSysRegCheckContinue, /* caller should continue generating code */ +} FPSysRegCheckResult; + +static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) +{ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + + switch (regno) { + case ARM_VFP_FPSCR: + case QEMU_VFP_FPSCR_NZCV: + break; + case ARM_VFP_FPSCR_NZCVQC: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_FPCXT_S: + case ARM_VFP_FPCXT_NS: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + if (!s->v8m_secure) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_VPR: + case ARM_VFP_P0: + if (!dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + break; + default: + return FPSysRegCheckFailed; + } + + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * So we don't call vfp_access_check() and the callers must handle this. 
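+ * (The ARM_VFP_FPCXT_NS cases in gen_M_fp_sysreg_write() and
+ * gen_M_fp_sysreg_read() below do this via gen_branch_fpInactive()
+ * and vfp_access_check_m().)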
+ */ + if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { + return FPSysRegCheckDone; + } + return FPSysRegCheckContinue; +} + +static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, + TCGLabel *label) +{ + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * We don't have a TB flag that matches the fpInactive check, so we + * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. + * + * Emit code that checks fpInactive and does a conditional + * branch to label based on it: + * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) + * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) + */ + assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); + + /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ + TCGv_i32 aspen, fpca; + aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); + fpca = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); + tcg_gen_or_i32(fpca, fpca, aspen); + tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); + tcg_temp_free_i32(aspen); + tcg_temp_free_i32(fpca); +} + +static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, + fp_sysreg_loadfn *loadfn, + void *opaque) +{ + /* Do a write to an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = loadfn(s, opaque, true); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPSCR_NZCVQC: + { + TCGv_i32 fpscr; + tmp = loadfn(s, opaque, true); + if (dc_isar_feature(aa32_mve, s)) { + /* QC is only present for MVE; otherwise RES0 */ + TCGv_i32 qc = tcg_temp_new_i32(); + tcg_gen_andi_i32(qc, tmp, FPCR_QC); + /* + * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; + * here writing the same value into all elements is simplest. + */ + tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), + 16, 16, qc); + } + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_or_i32(fpscr, fpscr, tmp); + store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); + tcg_temp_free_i32(tmp); + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGLabel *lab_active = gen_new_label(); + + lab_end = gen_new_label(); + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* + * fpInactive case: write is a NOP, so only do side effects + * like register writeback before we branch to end + */ + loadfn(s, opaque, false); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS writes + * behave the same as FPCXT_S writes. 
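+ * (Hence the fall-through into the ARM_VFP_FPCXT_S case below.)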
+ */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + } + /* fall through */ + case ARM_VFP_FPCXT_S: + { + TCGv_i32 sfpa, control; + /* + * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes + * bits [27:0] from value and zeroes bits [31:28]. + */ + tmp = loadfn(s, opaque, true); + sfpa = tcg_temp_new_i32(); + tcg_gen_shri_i32(sfpa, tmp, 31); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_deposit_i32(control, control, sfpa, + R_V7M_CONTROL_SFPA_SHIFT, 1); + store_cpu_field(control, v7m.control[M_REG_S]); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(sfpa); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + loadfn(s, opaque, false); + break; + } + tmp = loadfn(s, opaque, true); + store_cpu_field(tmp, v7m.vpr); + break; + case ARM_VFP_P0: + { + TCGv_i32 vpr; + tmp = loadfn(s, opaque, true); + vpr = load_cpu_field(v7m.vpr); + tcg_gen_deposit_i32(vpr, vpr, tmp, + R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + store_cpu_field(vpr, v7m.vpr); + tcg_temp_free_i32(tmp); + break; + } + default: + g_assert_not_reached(); + } + if (lab_end) { + gen_set_label(lab_end); + } + return true; +} + +static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, + fp_sysreg_storefn *storefn, + void *opaque) +{ + /* Do a read from an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + bool lookup_tb = false; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { + /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ + regno = QEMU_VFP_FPSCR_NZCV; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPSCR_NZCVQC: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + storefn(s, opaque, tmp, true); + break; + case QEMU_VFP_FPSCR_NZCV: + /* + * Read just NZCV; this is a special case to avoid the + * helper call for the "VMRS to CPSR.NZCV" insn. + */ + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPCXT_S: + { + TCGv_i32 control, sfpa, fpscr; + /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(sfpa); + /* + * Store result before updating FPSCR etc, in case + * it is a memory write which causes an exception. + */ + storefn(s, opaque, tmp, true); + /* + * Now we must reset FPSCR from FPDSCR_NS, and clear + * CONTROL.SFPA; so we'll end the TB here. 
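+ * (Ending the TB is handled by setting lookup_tb, which causes the
+ * gen_lookup_tb() call at the end of this function.)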
+ */ + tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); + store_cpu_field(control, v7m.control[M_REG_S]); + fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + lookup_tb = true; + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGv_i32 control, sfpa, fpscr, fpdscr, zero; + TCGLabel *lab_active = gen_new_label(); + + lookup_tb = true; + + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* fpInactive case: reads as FPDSCR_NS */ + TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); + storefn(s, opaque, tmp, true); + lab_end = gen_new_label(); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS + * reads the same as FPCXT_S. + */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + fpscr = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(fpscr, cpu_env); + tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(control); + /* Store result before updating FPSCR, in case it faults */ + storefn(s, opaque, tmp, true); + /* If SFPA is zero then set FPSCR from FPDSCR_NS */ + fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(zero); + tcg_temp_free_i32(sfpa); + tcg_temp_free_i32(fpdscr); + tcg_temp_free_i32(fpscr); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + storefn(s, opaque, NULL, false); + break; + } + tmp = load_cpu_field(v7m.vpr); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_P0: + tmp = load_cpu_field(v7m.vpr); + tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + storefn(s, opaque, tmp, true); + break; + default: + g_assert_not_reached(); + } + + if (lab_end) { + gen_set_label(lab_end); + } + if (lookup_tb) { + gen_lookup_tb(s); + } + return true; +} + +static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return; + } + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR */ + gen_set_nzcv(value); + tcg_temp_free_i32(value); + } else { + store_reg(s, a->rt, value); + } +} + +static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return NULL; + } + return load_reg(s, a->rt); +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + /* + * Accesses to R15 are UNPREDICTABLE; we choose to undef. + * FPSCR -> r15 is a special case which writes to the PSR flags; + * set a->reg to a special value to tell gen_M_fp_sysreg_read() + * we only care about the top 4 bits of FPSCR there. 
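+ * (The special value is QEMU_VFP_FPSCR_NZCV, which
+ * gen_M_fp_sysreg_read() handles by reading only the NZCV bits.)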
+ */ + if (a->rt == 15) { + if (a->l && a->reg == ARM_VFP_FPSCR) { + a->reg = QEMU_VFP_FPSCR_NZCV; + } else { + return false; + } + } + + if (a->l) { + /* VMRS, move FP system register to gp register */ + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); + } else { + /* VMSR, move gp register to FP system register */ + return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); + } +} + +static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + gen_aa32_st_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + tcg_temp_free_i32(value); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + TCGv_i32 value = NULL; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return NULL; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + value = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + return value; +} + +static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); +} + +static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); +} + static bool trans_NOCP(DisasContext *s, arg_nocp *a) { /* diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c index e91f526a1a..67462bdf27 100644 --- a/target/arm/translate-mve.c +++ b/target/arm/translate-mve.c @@ -27,3 +27,762 @@ /* Include the generated decoder */ #include "decode-mve.c.inc" + +typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); +typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32); + +/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ +static inline long mve_qreg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg].d[0]); +} + +static TCGv_ptr mve_qreg_ptr(unsigned reg) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, mve_qreg_offset(reg)); + return ret; +} + +static bool 
mve_check_qreg_bank(DisasContext *s, int qmask) +{ + /* + * Check whether Qregs are in range. For v8.1M only Q0..Q7 + * are supported, see VFPSmallRegisterBank(). + */ + return qmask < 8; +} + +bool mve_eci_check(DisasContext *s) +{ + /* + * This is a beatwise insn: check that ECI is valid (not a + * reserved value) and note that we are handling it. + * Return true if OK, false if we generated an exception. + */ + s->eci_handled = true; + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + /* Reserved value: INVSTATE UsageFault */ + gen_exception_insn(s, s->pc_curr, EXCP_INVSTATE, syn_uncategorized(), + default_exception_el(s)); + return false; + } +} + +static void mve_update_eci(DisasContext *s) +{ + /* + * The helper function will always update the CPUState field, + * so we only need to update the DisasContext field. + */ + if (s->eci) { + s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE; + } +} + +void mve_update_and_store_eci(DisasContext *s) +{ + /* + * For insns which don't call a helper function that will call + * mve_advance_vpt(), this version updates s->eci and also stores + * it out to the CPUState field. + */ + if (s->eci) { + mve_update_eci(s); + store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits); + } +} + +static bool mve_skip_first_beat(DisasContext *s) +{ + /* Return true if PSR.ECI says we must skip the first beat of this insn */ + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + g_assert_not_reached(); + } +} + +static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn) +{ + TCGv_i32 addr; + uint32_t offset; + TCGv_ptr qreg; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + if (a->rn == 15 || (a->rn == 13 && a->w)) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << a->size; + if (!a->a) { + offset = -offset; + } + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + qreg = mve_qreg_ptr(a->qd); + fn(cpu_env, qreg, addr); + tcg_temp_free_ptr(qreg); + + /* + * Writeback always happens after the last beat of the insn, + * regardless of predication + */ + if (a->w) { + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + mve_update_eci(s); + return true; +} + +static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) +{ + static MVEGenLdStFn * const ldstfns[4][2] = { + { gen_helper_mve_vstrb, gen_helper_mve_vldrb }, + { gen_helper_mve_vstrh, gen_helper_mve_vldrh }, + { gen_helper_mve_vstrw, gen_helper_mve_vldrw }, + { NULL, NULL } + }; + return do_ldst(s, a, ldstfns[a->size][a->l]); +} + +#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST) \ + static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \ + { \ + static MVEGenLdStFn * const ldstfns[2][2] = { \ + { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \ + { NULL, gen_helper_mve_##ULD }, \ + }; \ + return do_ldst(s, a, ldstfns[a->u][a->l]); \ + } + +DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h) +DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w) +DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w) + +static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ + TCGv_ptr qd; + TCGv_i32 rt; + + if 
(!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->rt == 13 || a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rt = load_reg(s, a->rt); + tcg_gen_dup_i32(a->size, rt, rt); + gen_helper_mve_vdup(cpu_env, qd, rt); + tcg_temp_free_ptr(qd); + tcg_temp_free_i32(rt); + mve_update_eci(s); + return true; +} + +static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) +{ + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_1OP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_1OP(VCLZ, vclz) +DO_1OP(VCLS, vcls) +DO_1OP(VABS, vabs) +DO_1OP(VNEG, vneg) + +static bool trans_VREV16(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev16b, + NULL, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV32(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev32b, + gen_helper_mve_vrev32h, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV64(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev64b, + gen_helper_mve_vrev64h, + gen_helper_mve_vrev64w, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VMVN(DisasContext *s, arg_1op *a) +{ + return do_1op(s, a, gen_helper_mve_vmvn); +} + +static bool trans_VABS_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfabsh, + gen_helper_mve_vfabss, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfnegh, + gen_helper_mve_vfnegs, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn) +{ + TCGv_ptr qd, qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qn, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_LOGIC(INSN, HELPER) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + return do_2op(s, a, HELPER); \ + } + +DO_LOGIC(VAND, gen_helper_mve_vand) +DO_LOGIC(VBIC, gen_helper_mve_vbic) +DO_LOGIC(VORR, gen_helper_mve_vorr) +DO_LOGIC(VORN, gen_helper_mve_vorn) +DO_LOGIC(VEOR, gen_helper_mve_veor) + +#define DO_2OP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const 
fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op(s, a, fns[a->size]); \ + } + +DO_2OP(VADD, vadd) +DO_2OP(VSUB, vsub) +DO_2OP(VMUL, vmul) +DO_2OP(VMULH_S, vmulhs) +DO_2OP(VMULH_U, vmulhu) +DO_2OP(VRMULH_S, vrmulhs) +DO_2OP(VRMULH_U, vrmulhu) +DO_2OP(VMAX_S, vmaxs) +DO_2OP(VMAX_U, vmaxu) +DO_2OP(VMIN_S, vmins) +DO_2OP(VMIN_U, vminu) +DO_2OP(VABD_S, vabds) +DO_2OP(VABD_U, vabdu) +DO_2OP(VHADD_S, vhadds) +DO_2OP(VHADD_U, vhaddu) +DO_2OP(VHSUB_S, vhsubs) +DO_2OP(VHSUB_U, vhsubu) +DO_2OP(VMULL_BS, vmullbs) +DO_2OP(VMULL_BU, vmullbu) +DO_2OP(VMULL_TS, vmullts) +DO_2OP(VMULL_TU, vmulltu) +DO_2OP(VQDMULH, vqdmulh) +DO_2OP(VQRDMULH, vqrdmulh) +DO_2OP(VQADD_S, vqadds) +DO_2OP(VQADD_U, vqaddu) +DO_2OP(VQSUB_S, vqsubs) +DO_2OP(VQSUB_U, vqsubu) +DO_2OP(VSHL_S, vshls) +DO_2OP(VSHL_U, vshlu) +DO_2OP(VRSHL_S, vrshls) +DO_2OP(VRSHL_U, vrshlu) +DO_2OP(VQSHL_S, vqshls) +DO_2OP(VQSHL_U, vqshlu) +DO_2OP(VQRSHL_S, vqrshls) +DO_2OP(VQRSHL_U, vqrshlu) +DO_2OP(VQDMLADH, vqdmladh) +DO_2OP(VQDMLADHX, vqdmladhx) +DO_2OP(VQRDMLADH, vqrdmladh) +DO_2OP(VQRDMLADHX, vqrdmladhx) +DO_2OP(VQDMLSDH, vqdmlsdh) +DO_2OP(VQDMLSDHX, vqdmlsdhx) +DO_2OP(VQRDMLSDH, vqrdmlsdh) +DO_2OP(VQRDMLSDHX, vqrdmlsdhx) +DO_2OP(VRHADD_S, vrhadds) +DO_2OP(VRHADD_U, vrhaddu) +/* + * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose + * so we can reuse the DO_2OP macro. (Our implementation calculates the + * "expected" results in this case.) Similarly for VHCADD. + */ +DO_2OP(VCADD90, vcadd90) +DO_2OP(VCADD270, vcadd270) +DO_2OP(VHCADD90, vhcadd90) +DO_2OP(VHCADD270, vhcadd270) + +static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullbh, + gen_helper_mve_vqdmullbw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullth, + gen_helper_mve_vqdmulltw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +/* + * VADC and VSBC: these perform an add-with-carry or subtract-with-carry + * of the 32-bit elements in each lane of the input vectors, where the + * carry-out of each add is the carry-in of the next. The initial carry + * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C + * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C. + * These insns are subject to beat-wise execution. Partial execution + * of an I=1 (initial carry input fixed) insn which does not + * execute the first beat must start with the current FPSCR.NZCV + * value, not the fixed constant input. 
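+ * (This is why trans_VADCI() and trans_VSBCI() below fall back to
+ * the non-I variants when mve_skip_first_beat() is true.)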
+ */ +static bool trans_VADC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vadc); +} + +static bool trans_VADCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VADC(s, a); + } + return do_2op(s, a, gen_helper_mve_vadci); +} + +static bool trans_VSBC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vsbc); +} + +static bool trans_VSBCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VSBC(s, a); + } + return do_2op(s, a, gen_helper_mve_vsbci); +} + +static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, + MVEGenTwoOpScalarFn fn) +{ + TCGv_ptr qd, qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn) || + !fn) { + return false; + } + if (a->rm == 13 || a->rm == 15) { + /* UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + rm = load_reg(s, a->rm); + fn(cpu_env, qd, qn, rm); + tcg_temp_free_i32(rm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_2OP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_SCALAR(VADD_scalar, vadd_scalar) +DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) +DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) +DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) +DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) +DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) +DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) +DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) +DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) +DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) +DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) +DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) +DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) +DO_2OP_SCALAR(VBRSR, vbrsr) + +static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullb_scalarh, + gen_helper_mve_vqdmullb_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullt_scalarh, + gen_helper_mve_vqdmullt_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} + +static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, + MVEGenDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn | a->qm) || + !fn) { + return false; + } + /* + * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related + * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. + */ + if (a->rdahi == 13 || a->rdahi == 15) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. 
Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. + */ + if (a->a || mve_skip_first_beat(s)) { + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + rda = tcg_const_i64(0); + } + + fn(rda, cpu_env, qn, qm, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh }, + { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavuh, NULL }, + { gen_helper_mve_vmlaldavuw, NULL }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh }, + { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhuw, NULL, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VPST(DisasContext *s, arg_VPST *a) +{ + TCGv_i32 vpr; + + /* mask == 0 is a "related encoding" */ + if (!dc_isar_feature(aa32_mve, s) || !a->mask) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + /* + * Set the VPR mask fields. We take advantage of MASK01 and MASK23 + * being adjacent fields in the register. + * + * This insn is not predicated, but it is subject to beat-wise + * execution, and the mask is updated on the odd-numbered beats. + * So if PSR.ECI says we should skip beat 1, we mustn't update the + * 01 mask field. 
+ */ + vpr = load_cpu_field(v7m.vpr); + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + /* Update both 01 and 23 fields */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(a->mask | (a->mask << 4)), + R_V7M_VPR_MASK01_SHIFT, + R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH); + break; + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + /* Update only the 23 mask field */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(a->mask), + R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH); + break; + default: + g_assert_not_reached(); + } + store_cpu_field(vpr, v7m.vpr); + mve_update_and_store_eci(s); + return true; +} + +static bool trans_VADDV(DisasContext *s, arg_VADDV *a) +{ + /* VADDV: vector add across vector */ + static MVEGenVADDVFn * const fns[4][2] = { + { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub }, + { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh }, + { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw }, + { NULL, NULL } + }; + TCGv_ptr qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + a->size == 3) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current value of Rda, not zero. + */ + if (a->a || mve_skip_first_beat(s)) { + /* Accumulate input from Rda */ + rda = load_reg(s, a->rda); + } else { + /* Accumulate starting at zero */ + rda = tcg_const_i32(0); + } + + qm = mve_qreg_ptr(a->qm); + fns[a->size][a->u](rda, cpu_env, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + + mve_update_eci(s); + return true; +} diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c index 01e26a246d..b2991e21ec 100644 --- a/target/arm/translate-vfp.c +++ b/target/arm/translate-vfp.c @@ -132,32 +132,75 @@ static void gen_preserve_fp_state(DisasContext *s) } /* - * Check that VFP access is enabled. If it is, do the necessary - * M-profile lazy-FP handling and then return true. - * If not, emit code to generate an appropriate exception and - * return false. + * Generate code for M-profile FP context handling: update the + * ownership of the FP context, and create a new context if + * necessary. This corresponds to the parts of the pseudocode + * ExecuteFPCheck() after the initial PreserveFPState() call. + */ +static void gen_update_fp_context(DisasContext *s) +{ + /* Update ownership of FP context: set FPCCR.S to match current state */ + if (s->v8m_fpccr_s_wrong) { + TCGv_i32 tmp; + + tmp = load_cpu_field(v7m.fpccr[M_REG_S]); + if (s->v8m_secure) { + tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); + } else { + tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); + } + store_cpu_field(tmp, v7m.fpccr[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v8m_fpccr_s_wrong = false; + } + + if (s->v7m_new_fp_ctxt_needed) { + /* + * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, + * the FPSCR, and VPR. + */ + TCGv_i32 control, fpscr; + uint32_t bits = R_V7M_CONTROL_FPCA_MASK; + + fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + if (dc_isar_feature(aa32_mve, s)) { + TCGv_i32 z32 = tcg_const_i32(0); + store_cpu_field(z32, v7m.vpr); + } + + /* + * We don't need to arrange to end the TB, because the only + * parts of FPSCR which we cache in the TB flags are the VECLEN + * and VECSTRIDE, and those don't exist for M-profile. 
+ */ + + if (s->v8m_secure) { + bits |= R_V7M_CONTROL_SFPA_MASK; + } + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_ori_i32(control, control, bits); + store_cpu_field(control, v7m.control[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v7m_new_fp_ctxt_needed = false; + } +} + +/* + * Check that VFP access is enabled, A-profile specific version. + * + * If VFP is enabled, return true. If not, emit code to generate an + * appropriate exception and return false. * The ignore_vfp_enabled argument specifies that we should ignore - * whether VFP is enabled via FPEXC[EN]: this should be true for FMXR/FMRX + * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns. */ -static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) +static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) { if (s->fp_excp_el) { - if (arm_dc_feature(s, ARM_FEATURE_M)) { - /* - * M-profile mostly catches the "FPU disabled" case early, in - * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) - * which do coprocessor-checks are outside the large ranges of - * the encoding space handled by the patterns in m-nocp.decode, - * and for them we may need to raise NOCP here. - */ - gen_exception_insn(s, s->pc_curr, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - } else { - gen_exception_insn(s, s->pc_curr, EXCP_UDEF, - syn_fp_access_trap(1, 0xe, false), - s->fp_excp_el); - } + gen_exception_insn(s, s->pc_curr, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false), s->fp_excp_el); return false; } @@ -166,59 +209,40 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) unallocated_encoding(s); return false; } + return true; +} - if (arm_dc_feature(s, ARM_FEATURE_M)) { - /* Handle M-profile lazy FP state mechanics */ +/* + * Check that VFP access is enabled, M-profile specific version. + * + * If VFP is enabled, do the necessary M-profile lazy-FP handling and then + * return true. If not, emit code to generate an appropriate exception and + * return false. + * skip_context_update is true to skip the "update FP context" part of this. + */ +bool vfp_access_check_m(DisasContext *s, bool skip_context_update) +{ + if (s->fp_excp_el) { + /* + * M-profile mostly catches the "FPU disabled" case early, in + * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) + * which do coprocessor-checks are outside the large ranges of + * the encoding space handled by the patterns in m-nocp.decode, + * and for them we may need to raise NOCP here. + */ + gen_exception_insn(s, s->pc_curr, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return false; + } - /* Trigger lazy-state preservation if necessary */ - gen_preserve_fp_state(s); + /* Handle M-profile lazy FP state mechanics */ - /* Update ownership of FP context: set FPCCR.S to match current state */ - if (s->v8m_fpccr_s_wrong) { - TCGv_i32 tmp; + /* Trigger lazy-state preservation if necessary */ + gen_preserve_fp_state(s); - tmp = load_cpu_field(v7m.fpccr[M_REG_S]); - if (s->v8m_secure) { - tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); - } else { - tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); - } - store_cpu_field(tmp, v7m.fpccr[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v8m_fpccr_s_wrong = false; - } - - if (s->v7m_new_fp_ctxt_needed) { - /* - * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, - * the FPSCR, and VPR. 
- */ - TCGv_i32 control, fpscr; - uint32_t bits = R_V7M_CONTROL_FPCA_MASK; - - fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - if (dc_isar_feature(aa32_mve, s)) { - TCGv_i32 z32 = tcg_const_i32(0); - store_cpu_field(z32, v7m.vpr); - } - - /* - * We don't need to arrange to end the TB, because the only - * parts of FPSCR which we cache in the TB flags are the VECLEN - * and VECSTRIDE, and those don't exist for M-profile. - */ - - if (s->v8m_secure) { - bits |= R_V7M_CONTROL_SFPA_MASK; - } - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_ori_i32(control, control, bits); - store_cpu_field(control, v7m.control[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v7m_new_fp_ctxt_needed = false; - } + if (!skip_context_update) { + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); } return true; @@ -230,7 +254,11 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) */ bool vfp_access_check(DisasContext *s) { - return full_vfp_access_check(s, false); + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return vfp_access_check_m(s, false); + } else { + return vfp_access_check_a(s, false); + } } static bool trans_VSEL(DisasContext *s, arg_VSEL *a) @@ -553,6 +581,48 @@ static bool trans_VCVT(DisasContext *s, arg_VCVT *a) return true; } +static bool mve_skip_vmov(DisasContext *s, int vn, int index, int size) +{ + /* + * In a CPU with MVE, the VMOV (vector lane to general-purpose register) + * and VMOV (general-purpose register to vector lane) insns are not + * predicated, but they are subject to beatwise execution if they are + * not in an IT block. + * + * Since our implementation always executes all 4 beats in one tick, + * this means only that if PSR.ECI says we should not be executing + * the beat corresponding to the lane of the vector register being + * accessed then we should skip performing the move, and that we need + * to do the usual check for bad ECI state and advance of ECI state. + * + * Note that if PSR.ECI is non-zero then we cannot be in an IT block. + * + * Return true if this VMOV scalar <-> gpreg should be skipped because + * the MVE PSR.ECI state says we skip the beat where the store happens. + */ + + /* Calculate the byte offset into Qn which we're going to access */ + int ofs = (index << size) + ((vn & 1) * 8); + + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + return ofs < 4; + case ECI_A0A1: + return ofs < 8; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return ofs < 12; + default: + g_assert_not_reached(); + } +} + static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) { /* VMOV scalar to general purpose register */ @@ -575,14 +645,26 @@ static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) return false; } + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + if (!vfp_access_check(s)) { return true; } - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vn, a->index, a->size | (a->u ? 0 : MO_SIGN)); - store_reg(s, a->rt, tmp); + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, a->index, + a->size | (a->u ? 
0 : MO_SIGN)); + store_reg(s, a->rt, tmp); + } + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } return true; } @@ -608,14 +690,25 @@ static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a) return false; } + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + if (!vfp_access_check(s)) { return true; } - tmp = load_reg(s, a->rt); - write_neon_element32(tmp, a->vn, a->index, a->size); - tcg_temp_free_i32(tmp); + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = load_reg(s, a->rt); + write_neon_element32(tmp, a->vn, a->index, a->size); + tcg_temp_free_i32(tmp); + } + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } return true; } @@ -663,408 +756,14 @@ static bool trans_VDUP(DisasContext *s, arg_VDUP *a) return true; } -/* - * M-profile provides two different sets of instructions that can - * access floating point system registers: VMSR/VMRS (which move - * to/from a general purpose register) and VLDR/VSTR sysreg (which - * move directly to/from memory). In some cases there are also side - * effects which must happen after any write to memory (which could - * cause an exception). So we implement the common logic for the - * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), - * which take pointers to callback functions which will perform the - * actual "read/write general purpose register" and "read/write - * memory" operations. - */ - -/* - * Emit code to store the sysreg to its final destination; frees the - * TCG temp 'value' it is passed. - */ -typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value); -/* - * Emit code to load the value to be copied to the sysreg; returns - * a new TCG temporary - */ -typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque); - -/* Common decode/access checks for fp sysreg read/write */ -typedef enum FPSysRegCheckResult { - FPSysRegCheckFailed, /* caller should return false */ - FPSysRegCheckDone, /* caller should return true */ - FPSysRegCheckContinue, /* caller should continue generating code */ -} FPSysRegCheckResult; - -static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) -{ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - - switch (regno) { - case ARM_VFP_FPSCR: - case QEMU_VFP_FPSCR_NZCV: - break; - case ARM_VFP_FPSCR_NZCVQC: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_FPCXT_S: - case ARM_VFP_FPCXT_NS: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - if (!s->v8m_secure) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_VPR: - case ARM_VFP_P0: - if (!dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - break; - default: - return FPSysRegCheckFailed; - } - - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * So we don't call vfp_access_check() and the callers must handle this. 
- */ - if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { - return FPSysRegCheckDone; - } - return FPSysRegCheckContinue; -} - -static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, - TCGLabel *label) -{ - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * We don't have a TB flag that matches the fpInactive check, so we - * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. - * - * Emit code that checks fpInactive and does a conditional - * branch to label based on it: - * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) - * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) - */ - assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); - - /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ - TCGv_i32 aspen, fpca; - aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); - fpca = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); - tcg_gen_or_i32(fpca, fpca, aspen); - tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); - tcg_temp_free_i32(aspen); - tcg_temp_free_i32(fpca); -} - -static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, - - fp_sysreg_loadfn *loadfn, - void *opaque) -{ - /* Do a write to an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = loadfn(s, opaque); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - gen_lookup_tb(s); - break; - case ARM_VFP_FPSCR_NZCVQC: - { - TCGv_i32 fpscr; - tmp = loadfn(s, opaque); - if (dc_isar_feature(aa32_mve, s)) { - /* QC is only present for MVE; otherwise RES0 */ - TCGv_i32 qc = tcg_temp_new_i32(); - tcg_gen_andi_i32(qc, tmp, FPCR_QC); - /* - * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; - * here writing the same value into all elements is simplest. - */ - tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), - 16, 16, qc); - } - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); - tcg_gen_or_i32(fpscr, fpscr, tmp); - store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); - tcg_temp_free_i32(tmp); - break; - } - case ARM_VFP_FPCXT_NS: - lab_end = gen_new_label(); - /* fpInactive case: write is a NOP, so branch to end */ - gen_branch_fpInactive(s, TCG_COND_NE, lab_end); - /* !fpInactive: PreserveFPState(), and reads same as FPCXT_S */ - gen_preserve_fp_state(s); - /* fall through */ - case ARM_VFP_FPCXT_S: - { - TCGv_i32 sfpa, control; - /* - * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes - * bits [27:0] from value and zeroes bits [31:28]. 
- */ - tmp = loadfn(s, opaque); - sfpa = tcg_temp_new_i32(); - tcg_gen_shri_i32(sfpa, tmp, 31); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_deposit_i32(control, control, sfpa, - R_V7M_CONTROL_SFPA_SHIFT, 1); - store_cpu_field(control, v7m.control[M_REG_S]); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(sfpa); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - break; - } - tmp = loadfn(s, opaque); - store_cpu_field(tmp, v7m.vpr); - break; - case ARM_VFP_P0: - { - TCGv_i32 vpr; - tmp = loadfn(s, opaque); - vpr = load_cpu_field(v7m.vpr); - tcg_gen_deposit_i32(vpr, vpr, tmp, - R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - store_cpu_field(vpr, v7m.vpr); - tcg_temp_free_i32(tmp); - break; - } - default: - g_assert_not_reached(); - } - if (lab_end) { - gen_set_label(lab_end); - } - return true; -} - -static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, - fp_sysreg_storefn *storefn, - void *opaque) -{ - /* Do a read from an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - bool lookup_tb = false; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { - /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ - regno = QEMU_VFP_FPSCR_NZCV; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - storefn(s, opaque, tmp); - break; - case ARM_VFP_FPSCR_NZCVQC: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); - storefn(s, opaque, tmp); - break; - case QEMU_VFP_FPSCR_NZCV: - /* - * Read just NZCV; this is a special case to avoid the - * helper call for the "VMRS to CPSR.NZCV" insn. - */ - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - storefn(s, opaque, tmp); - break; - case ARM_VFP_FPCXT_S: - { - TCGv_i32 control, sfpa, fpscr; - /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(sfpa); - /* - * Store result before updating FPSCR etc, in case - * it is a memory write which causes an exception. - */ - storefn(s, opaque, tmp); - /* - * Now we must reset FPSCR from FPDSCR_NS, and clear - * CONTROL.SFPA; so we'll end the TB here. 
- */ - tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); - store_cpu_field(control, v7m.control[M_REG_S]); - fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - lookup_tb = true; - break; - } - case ARM_VFP_FPCXT_NS: - { - TCGv_i32 control, sfpa, fpscr, fpdscr, zero; - TCGLabel *lab_active = gen_new_label(); - - lookup_tb = true; - - gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); - /* fpInactive case: reads as FPDSCR_NS */ - TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); - storefn(s, opaque, tmp); - lab_end = gen_new_label(); - tcg_gen_br(lab_end); - - gen_set_label(lab_active); - /* !fpInactive: Reads the same as FPCXT_S, but side effects differ */ - gen_preserve_fp_state(s); - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - fpscr = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(fpscr, cpu_env); - tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(control); - /* Store result before updating FPSCR, in case it faults */ - storefn(s, opaque, tmp); - /* If SFPA is zero then set FPSCR from FPDSCR_NS */ - fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - zero = tcg_const_i32(0); - tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, zero, fpdscr, fpscr); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(zero); - tcg_temp_free_i32(sfpa); - tcg_temp_free_i32(fpdscr); - tcg_temp_free_i32(fpscr); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - break; - } - tmp = load_cpu_field(v7m.vpr); - storefn(s, opaque, tmp); - break; - case ARM_VFP_P0: - tmp = load_cpu_field(v7m.vpr); - tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - storefn(s, opaque, tmp); - break; - default: - g_assert_not_reached(); - } - - if (lab_end) { - gen_set_label(lab_end); - } - if (lookup_tb) { - gen_lookup_tb(s); - } - return true; -} - -static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value) -{ - arg_VMSR_VMRS *a = opaque; - - if (a->rt == 15) { - /* Set the 4 flag bits in the CPSR */ - gen_set_nzcv(value); - tcg_temp_free_i32(value); - } else { - store_reg(s, a->rt, value); - } -} - -static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque) -{ - arg_VMSR_VMRS *a = opaque; - - return load_reg(s, a->rt); -} - -static bool gen_M_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) -{ - /* - * Accesses to R15 are UNPREDICTABLE; we choose to undef. - * FPSCR -> r15 is a special case which writes to the PSR flags; - * set a->reg to a special value to tell gen_M_fp_sysreg_read() - * we only care about the top 4 bits of FPSCR there. 
- */ - if (a->rt == 15) { - if (a->l && a->reg == ARM_VFP_FPSCR) { - a->reg = QEMU_VFP_FPSCR_NZCV; - } else { - return false; - } - } - - if (a->l) { - /* VMRS, move FP system register to gp register */ - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); - } else { - /* VMSR, move gp register to FP system register */ - return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); - } -} - static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) { TCGv_i32 tmp; bool ignore_vfp_enabled = false; if (arm_dc_feature(s, ARM_FEATURE_M)) { - return gen_M_VMSR_VMRS(s, a); + /* M profile version was already handled in m-nocp.decode */ + return false; } if (!dc_isar_feature(aa32_fpsp_v2, s)) { @@ -1114,7 +813,11 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) return false; } - if (!full_vfp_access_check(s, ignore_vfp_enabled)) { + /* + * Call vfp_access_check_a() directly, because we need to tell + * it to ignore FPEXC.EN for some register accesses. + */ + if (!vfp_access_check_a(s, ignore_vfp_enabled)) { return true; } @@ -1200,96 +903,6 @@ static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) return true; } -static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - - if (!a->a) { - offset = - offset; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - gen_aa32_st_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - tcg_temp_free_i32(value); - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } -} - -static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - TCGv_i32 value = tcg_temp_new_i32(); - - if (!a->a) { - offset = - offset; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - gen_aa32_ld_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - return value; -} - -static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); -} - -static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); -} static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a) { diff --git a/target/arm/translate.h b/target/arm/translate.h index 2821b325e3..99c917c571 100644 --- a/target/arm/translate.h +++ b/target/arm/translate.h @@ -136,6 +136,11 @@ static inline int negate(DisasContext *s, int x) return -x; } +static inline int plus_1(DisasContext *s, int x) +{ + return x + 1; +} + static inline int plus_2(DisasContext *s, int x) { return x + 2; @@ -151,6 +156,11 @@ static inline int times_4(DisasContext *s, int x) return x * 4; } 
+static inline int times_2_plus_1(DisasContext *s, int x) +{ + return x * 2 + 1; +} + static inline int arm_dc_feature(DisasContext *dc, int feature) { return (dc->features & (1ULL << feature)) != 0; diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode index 52535d9b0b..5405e80197 100644 --- a/target/arm/vfp.decode +++ b/target/arm/vfp.decode @@ -84,20 +84,6 @@ VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 1010 imm:8 vd=%vd_sp VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp -# M-profile VLDR/VSTR to sysreg -%vldr_sysreg 22:1 13:3 -%imm7_0x4 0:7 !function=times_4 - -&vldr_sysreg rn reg imm a w p -@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \ - reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg - -# P=0 W=0 is SEE "Related encodings", so split into two patterns -VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 -VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 -VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1 -VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 - # We split the load/store multiple up into two patterns to avoid # overlap with other insns in the "Advanced SIMD load/store and 64-bit move" # grouping: diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index 498a959839..515db120cc 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -386,7 +386,7 @@ uint64_t (dup_const)(unsigned vece, uint64_t c) } /* Duplicate IN into OUT as per VECE. */ -static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) +void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) { switch (vece) { case MO_8: @@ -404,7 +404,7 @@ static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) } } -static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) +void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) { switch (vece) { case MO_8: @@ -578,15 +578,15 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, && (vece != MO_32 || !check_size_impl(oprsz, 4))) { t_64 = tcg_temp_new_i64(); tcg_gen_extu_i32_i64(t_64, in_32); - gen_dup_i64(vece, t_64, t_64); + tcg_gen_dup_i64(vece, t_64, t_64); } else { t_32 = tcg_temp_new_i32(); - gen_dup_i32(vece, t_32, in_32); + tcg_gen_dup_i32(vece, t_32, in_32); } } else if (in_64) { /* We are given a 64-bit variable input. */ t_64 = tcg_temp_new_i64(); - gen_dup_i64(vece, t_64, in_64); + tcg_gen_dup_i64(vece, t_64, in_64); } else { /* We are given a constant input. 
*/ /* For 64-bit hosts, use 64-bit constants for "simple" constants @@ -1311,14 +1311,14 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, } else if (g->fni8 && check_size_impl(oprsz, 8)) { TCGv_i64 t64 = tcg_temp_new_i64(); - gen_dup_i64(g->vece, t64, c); + tcg_gen_dup_i64(g->vece, t64, c); expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); tcg_temp_free_i64(t64); } else if (g->fni4 && check_size_impl(oprsz, 4)) { TCGv_i32 t32 = tcg_temp_new_i32(); tcg_gen_extrl_i64_i32(t32, c); - gen_dup_i32(g->vece, t32, t32); + tcg_gen_dup_i32(g->vece, t32, t32); expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); tcg_temp_free_i32(t32); } else { @@ -2538,7 +2538,7 @@ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); tcg_temp_free_i64(tmp); } @@ -2562,7 +2562,7 @@ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); tcg_temp_free_i64(tmp); } @@ -2586,7 +2586,7 @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) { TCGv_i64 tmp = tcg_temp_new_i64(); - gen_dup_i64(vece, tmp, c); + tcg_gen_dup_i64(vece, tmp, c); tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); tcg_temp_free_i64(tmp); }
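The DO_1OP/DO_2OP/DO_2OP_SCALAR macros in the MVE translate code above all follow the same pattern: build a per-element-size table of helper functions, index it with a->size, and leave NULL in the slots for element sizes the insn does not have, so that the !fn guard in do_1op()/do_2op() makes the insn UNDEF. As an illustration of what the macro text above expands to (nothing beyond the macro itself), DO_2OP(VADD, vadd) produces:

static bool trans_VADD(DisasContext *s, arg_2op *a)
{
    static MVEGenTwoOpFn * const fns[] = {
        gen_helper_mve_vaddb,   /* size 0: 8-bit elements */
        gen_helper_mve_vaddh,   /* size 1: 16-bit elements */
        gen_helper_mve_vaddw,   /* size 2: 32-bit elements */
        NULL,                   /* size 3: no such encoding; do_2op() returns false */
    };
    return do_2op(s, a, fns[a->size]);
}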
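The block comment above trans_VADC()/trans_VSBC() describes the carry-chained semantics, but the helpers themselves are outside this diff. A minimal reference-model sketch of the behaviour that comment describes, assuming four 32-bit lanes and ignoring predication and beat-wise partial execution (ref_vadc() is a hypothetical name used only for illustration, not QEMU code):

#include <stdint.h>

/*
 * Add the 32-bit lanes of n and m into d, chaining the carry out of each
 * lane into the next. carry_in is the fixed 0 for VADCI or FPSCR.C for
 * VADC; the value returned is the final carry, which the real insn
 * writes back to FPSCR.C.
 */
static uint32_t ref_vadc(uint32_t d[4], const uint32_t n[4],
                         const uint32_t m[4], uint32_t carry_in)
{
    for (int e = 0; e < 4; e++) {
        uint64_t r = (uint64_t)n[e] + m[e] + carry_in;
        d[e] = (uint32_t)r;
        carry_in = (uint32_t)(r >> 32);
    }
    return carry_in;
}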
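A worked example may help with the mve_skip_vmov() offset check in the VMOV scalar <-> gpreg handling above; the concrete lane and register numbers below are illustrative assumptions, not taken from the patch:

/*
 * A 16-bit VMOV to lane 3 of the even D-register half of a Q register:
 * vn is even, index = 3, size = MO_16, so ofs = (3 << 1) + 0 = 6.
 *  - ECI_NONE:  no beats already executed; the move is performed.
 *  - ECI_A0:    beat 0 (bytes 0-3) already executed; 6 >= 4, move performed.
 *  - ECI_A0A1:  beats 0-1 (bytes 0-7) already executed; 6 < 8, the move is
 *               skipped and only the ECI state is checked and advanced.
 */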