qemu-e2k/target/arm/helper-a64.h

/*
 *  AArch64 specific helper definitions
 *
 *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
DEF_HELPER_FLAGS_2(udiv64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
DEF_HELPER_FLAGS_3(neon_cge_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
                   i64, env, i64, i64, i64)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
                   i64, env, i64, i64, i64)
target-arm: A64: add stubs for a64 specific helpers We will need helpers that only make sense with AArch64. Add helper-a64.{c,h} files as stubs that we can fill with these helpers in the following patches. Signed-off-by: Alexander Graf <agraf@suse.de> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2013-12-17 20:42:32 +01:00			`/*`
			`* AArch64 specific helper definitions`
			`*`
			`* Copyright (c) 2013 Alexander Graf <agraf@suse.de>`
			`*`
			`* This library is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2 of the License, or (at your option) any later version.`
			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with this library; if not, see <http://www.gnu.org/licenses/>.`
			`*/`
target-arm: A64: add support for 2-src data processing and DIV This patch adds support for decoding 2-src data processing insns, and the first users, UDIV and SDIV. Signed-off-by: Alexander Graf <agraf@suse.de> [claudio: adapted to new decoder adding the 2-src decoding level, always zero-extend result in 32bit mode] Signed-off-by: Claudio Fontana <claudio.fontana@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2013-12-17 20:42:34 +01:00			`DEF_HELPER_FLAGS_2(udiv64, TCG_CALL_NO_RWG_SE, i64, i64, i64)`
			`DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64)`
target-arm: A64: add support for 1-src RBIT insn This adds support for the C5.6.147 RBIT instruction. Signed-off-by: Alexander Graf <agraf@suse.de> [claudio: adapted to new decoder, use bswap64, make RBIT part standalone from the rest of the patch, splitting REV into a separate patch] Signed-off-by: Claudio Fontana <claudio.fontana@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2013-12-17 20:42:35 +01:00			`DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)`
target-arm: A64: Add support for floating point compare Add decoding support for C3.6.22 Floating-point compare. Signed-off-by: Claudio Fontana <claudio.fontana@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2014-01-04 23:15:50 +01:00			`DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)`
			`DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)`
			`DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)`
			`DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)`
target-arm: A64: Add SIMD TBL/TBLX Add support for the SIMD TBL/TBLX instructions (group C3.6.2). Signed-off-by: Michael Matz <matz@suse.de> [PMM: rewritten to do more of the decode in translate-a64.c, and to do only one 64 bit pass at a time in the helper] Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2014-01-31 15:47:31 +01:00			`DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)`
target-arm: A64: Implement plain vector SIMD indexed element insns Implement all the SIMD vector x indexed element instructions in the subcategory which are not 'long' ops. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2014-02-20 11:35:48 +01:00			`DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)`
			`DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)`
target-arm: A64: Implement SIMD FP compare and set insns This adds all forms of the SIMD floating point and set instructions: FCM(GT\|GE\|EQ\|LE\|LT) Most of the heavy lifting is done by either the existing neon helpers or some new helpers for the 64bit double cases. Most of the code paths are common although the 2misc versions are a little special as they compare against zero. Signed-off-by: Alex Bennée <alex.bennee@linaro.org> [PMM: fixed some minor bugs, added the 2-misc-scalar encoding] Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2014-02-20 11:35:49 +01:00			`DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)`
			`DEF_HELPER_FLAGS_3(neon_cge_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)`
			`DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)`
target-arm: A64: Implement remaining 3-same instructions Implement the remaining instructions in the SIMD 3-reg-same and scalar-3-reg-same groups: FMULX, FRECPS, FRSQRTS, FACGE, FACGT, FMLA and FMLS. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> 2014-02-20 11:35:50 +01:00			`DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)`
			`DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)`
			`DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)`
			`DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)`
target-arm: A64: Implement SADDLP, UADDLP, SADALP, UADALP Implement the SADDLP, UADDLP, SADALP and UADALP instructions in the SIMD 2-reg misc category. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Message-id: 1394822294-14837-8-git-send-email-peter.maydell@linaro.org 2014-03-17 17:31:48 +01:00			`DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)`
			`DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64)`
			`DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)`
			`DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)`
target-arm: A64: Add FRECPX (reciprocal exponent) These are fairly simple exponent only estimation functions using helpers. Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Message-id: 1394822294-14837-14-git-send-email-peter.maydell@linaro.org 2014-03-17 17:31:50 +01:00			`DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)`
			`DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)`
target-arm: A64: Implement FCVTXN Implement the FCVTXN operation, which does a narrowing fp precision conversion using the "round to odd" (von Neumann) mode. This can conveniently be implemented as "do operation using round to zero; then set the LSB of the mantissa to 1 if the Inexact flag was set". Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Message-id: 1394822294-14837-24-git-send-email-peter.maydell@linaro.org 2014-03-17 17:31:53 +01:00			`DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)`
target-arm: A64: Implement CRC instructions Implement the optional A64 CRC instructions. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Message-id: 1401458125-27977-6-git-send-email-peter.maydell@linaro.org 2014-06-09 16:43:25 +02:00			`DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)`
			`DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)`
target-arm: emulate aarch64's LL/SC using cmpxchg helpers Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative. The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine. Hi-res plots: http://imgur.com/a/JVc8Y atomic_add-bench: 1000000 ops/thread, [0,1] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 16 ++master +-H--+ ++ \|\| \| 14 ++ ++ \| \| \| 12 ++\| ++ \| \| \| 10 ++++ ++ 8 ++E ++ \|+++ \| 6 ++ \| ++ \| \| \| 4 ++ \| ++ \| \| \| 2 +H++E+--- ++ + \| +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E\| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,2] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 16 ++master +-H--+ ++ \| \| \| 14 ++E ++ \| \| \| 12 ++\| ++ \|+++ \| 10 ++ \| ++ 8 ++ \| ++ \| \| \| 6 ++ \| ++ \| \| \| 4 ++ \| ++ \| +E+--- \| 2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++ +++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E\| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,128] range 70 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 60 ++master +-H--+ +++ ---+E+-----+E+------+E+ \| +E+------E-------+E+--- \| \| --- +++ \| 50 ++ +++--- ++ \| -+E+ \| 40 ++ +++---- ++ \| E- \| \| --\| \| 30 ++ -- +++ ++ \| +E+ \| 20 ++E+ ++ \|E+ \| \| \| 10 ++ ++ + + + + + + + \| 0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,1024] range 160 ++---------+---------+----------+---------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 140 ++master +-H--+ +++ +++ \| -+E+-----+E+-------E\| 120 ++ +++ ---- +++ \| +++ ----E-- \| 100 ++ --E--- +++ ++ \| +++ ---- +++ \| 80 ++ --E-- ++ \| ---- +++ \| \| -+E+ \| 60 ++ ---- +++ ++ \| +E+- \| 40 ++ -- ++ \| +E+ \| 20 +EE+ ++ +++ + + + + + + \| 0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads [rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.] Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net> 2016-06-27 21:02:13 +02:00			`DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)`
target/arm: check CF_PARALLEL instead of parallel_cpus Thereby decoupling the resulting translated code from the current state of the system. Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> 2017-07-15 00:20:49 +02:00			`DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,`
			`i64, env, i64, i64, i64)`
target-arm: emulate aarch64's LL/SC using cmpxchg helpers Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative. The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine. Hi-res plots: http://imgur.com/a/JVc8Y atomic_add-bench: 1000000 ops/thread, [0,1] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 16 ++master +-H--+ ++ \|\| \| 14 ++ ++ \| \| \| 12 ++\| ++ \| \| \| 10 ++++ ++ 8 ++E ++ \|+++ \| 6 ++ \| ++ \| \| \| 4 ++ \| ++ \| \| \| 2 +H++E+--- ++ + \| +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E\| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,2] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 16 ++master +-H--+ ++ \| \| \| 14 ++E ++ \| \| \| 12 ++\| ++ \|+++ \| 10 ++ \| ++ 8 ++ \| ++ \| \| \| 6 ++ \| ++ \| \| \| 4 ++ \| ++ \| +E+--- \| 2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++ +++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E\| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,128] range 70 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 60 ++master +-H--+ +++ ---+E+-----+E+------+E+ \| +E+------E-------+E+--- \| \| --- +++ \| 50 ++ +++--- ++ \| -+E+ \| 40 ++ +++---- ++ \| E- \| \| --\| \| 30 ++ -- +++ ++ \| +E+ \| 20 ++E+ ++ \|E+ \| \| \| 10 ++ ++ + + + + + + + \| 0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,1024] range 160 ++---------+---------+----------+---------+----------+----------+---++ +cmpxchg +-E--+ + + + + + \| 140 ++master +-H--+ +++ +++ \| -+E+-----+E+-------E\| 120 ++ +++ ---- +++ \| +++ ----E-- \| 100 ++ --E--- +++ ++ \| +++ ---- +++ \| 80 ++ --E-- ++ \| ---- +++ \| \| -+E+ \| 60 ++ ---- +++ ++ \| +E+- \| 40 ++ -- ++ \| +E+ \| 20 +EE+ ++ +++ + + + + + + \| 0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads [rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.] Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net> 2016-06-27 21:02:13 +02:00			`DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)`
target/arm: check CF_PARALLEL instead of parallel_cpus Thereby decoupling the resulting translated code from the current state of the system. Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> 2017-07-15 00:20:49 +02:00			`DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,`
			`i64, env, i64, i64, i64)`