target/arm: Implement VFP fp16 for fused-multiply-add

Implement VFP fp16 support for the fused multiply-add instructions VFNMA, VFNMS, VFMA and VFMS.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-7-peter.maydell@linaro.org
2020-08-28 19:33:15 +01:00 · 2020-08-28 19:33:15 +01:00 · 9886fe2834
parent 2aa8dcfa14
commit 9886fe2834
4 changed files with 77 additions and 0 deletions
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@ -213,6 +213,7 @@ DEF_HELPER_FLAGS_3(vfp_fcvt_f64_to_f16, TCG_CALL_NO_RWG, f16, f64, ptr, i32)
 /*
  * Fused multiply-accumulate helpers, one per precision (f64, f32, f16).
  * Each takes three floating-point operands of that precision plus a
  * pointer argument, and returns a value of the same precision.
  * NOTE(review): the pointer is presumably the float_status to use, as in
  * the definitions in vfp_helper.c -- confirm.
  */
 DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
 DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, ptr)
 DEF_HELPER_3(recps_f32, f32, env, f32, f32)
 DEF_HELPER_3(rsqrts_f32, f32, env, f32, f32)
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@ -1913,6 +1913,69 @@ static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a)
                         a->vd, a->vn, a->vm, false);
 }
 static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
 {
    /*
     * Half-precision fused multiply-accumulate.  The four insns share
     * this body and differ only in which inputs are negated first:
     *
     *   insn   neg_n  neg_d   operation
     *   VFMA   false  false   fd = muladd( fd,  fn, fm)
     *   VFMS   true   false   fd = muladd( fd, -fn, fm)
     *   VFNMA  false  true    fd = muladd(-fd,  fn, fm)
     *   VFNMS  true   true    fd = muladd(-fd, -fn, fm)
     *
     * The multiply and the add must form a single floating point
     * operation: no rounding may happen between the two steps.
     * Negating the inputs as separate operations beforehand is still
     * correct, because an input NaN that is a negated input should
     * come out with its sign bit flipped.
     *
     * Returns false if the required features are absent or a short
     * vector operation is configured; returns true otherwise, including
     * when vfp_access_check() fails and no operation is emitted.
     */
    TCGv_i32 fn, fm, fd;
    TCGv_ptr fp_status;

    /*
     * These insns exist only in VFPv4 and only when the FP16 extension
     * is implemented.  The SIMDFMAC ID field on its own is not enough:
     * it is also non-zero on a core that has Neon but no VFP.
     */
    if (!(dc_isar_feature(aa32_fp16_arith, s) &&
          dc_isar_feature(aa32_simdfmac, s) &&
          dc_isar_feature(aa32_fpsp_v2, s))) {
        return false;
    }

    /* Reject any non-zero vector length or stride setting. */
    if (s->vec_len || s->vec_stride) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn = tcg_temp_new_i32();
    fm = tcg_temp_new_i32();
    fd = tcg_temp_new_i32();

    /* Fetch the two multiplicands; flip fn's sign for VFNMS and VFMS. */
    neon_load_reg32(fn, a->vn);
    neon_load_reg32(fm, a->vm);
    if (neg_n) {
        gen_helper_vfp_negh(fn, fn);
    }

    /* Fetch the accumulator; flip its sign for VFNMA and VFNMS. */
    neon_load_reg32(fd, a->vd);
    if (neg_d) {
        gen_helper_vfp_negh(fd, fd);
    }

    /* One helper call does the fused op; fd is both addend and result. */
    fp_status = fpstatus_ptr(FPST_FPCR_F16);
    gen_helper_vfp_muladdh(fd, fn, fm, fd, fp_status);
    neon_store_reg32(fd, a->vd);

    tcg_temp_free_ptr(fp_status);
    tcg_temp_free_i32(fn);
    tcg_temp_free_i32(fm);
    tcg_temp_free_i32(fd);

    return true;
 }
 static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d)
 {
    /*
@ -2062,6 +2125,7 @@ static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d)
    MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \
    MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true)
 /*
  * Expand the VFM trans function set once per precision suffix:
  * hp (half), sp (single) and dp (double).
  * NOTE(review): each expansion presumably routes to the matching
  * do_vfm_<prec>() -- the macro body is only partly visible here; confirm.
  */
 MAKE_VFM_TRANS_FNS(hp)
 MAKE_VFM_TRANS_FNS(sp)
 MAKE_VFM_TRANS_FNS(dp)
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@ -139,6 +139,11 @@ VDIV_hp      ---- 1110 1.00 .... .... 1001 .0.0 ....        @vfp_dnm_s
 VDIV_sp      ---- 1110 1.00 .... .... 1010 .0.0 ....        @vfp_dnm_s
 VDIV_dp      ---- 1110 1.00 .... .... 1011 .0.0 ....        @vfp_dnm_d
 # Half-precision fused multiply-add family.  These patterns match the
 # corresponding _sp ones except for the 1001 field (1010 for _sp), and
 # reuse the same @vfp_dnm_s format.
 VFMA_hp      ---- 1110 1.10 .... .... 1001 .0. 0 ....       @vfp_dnm_s
 VFMS_hp      ---- 1110 1.10 .... .... 1001 .1. 0 ....       @vfp_dnm_s
 VFNMA_hp     ---- 1110 1.01 .... .... 1001 .0. 0 ....       @vfp_dnm_s
 VFNMS_hp     ---- 1110 1.01 .... .... 1001 .1. 0 ....       @vfp_dnm_s
 VFMA_sp      ---- 1110 1.10 .... .... 1010 .0. 0 ....       @vfp_dnm_s
 VFMS_sp      ---- 1110 1.10 .... .... 1010 .1. 0 ....       @vfp_dnm_s
 VFNMA_sp     ---- 1110 1.01 .... .... 1010 .0. 0 ....       @vfp_dnm_s
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@ -1062,6 +1062,13 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a)
 }
 /* VFPv4 fused multiply-accumulate */
 dh_ctype_f16 VFP_HELPER(muladd, h)(dh_ctype_f16 a, dh_ctype_f16 b,
                                    dh_ctype_f16 c, void *fpstp)
 {
    /*
     * Half-precision variant: forward the three operands unchanged to
     * float16_muladd(), using the float_status that the caller passed
     * in as an untyped pointer.  The literal 0 is the fourth argument
     * (presumably the softfloat muladd flags word -- confirm).
     */
    float_status *status = fpstp;

    return float16_muladd(a, b, c, 0, status);
 }
 float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c, void *fpstp)
 {
    float_status *fpst = fpstp;