diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 18b488b89a..80251b55b1 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -461,4 +461,10 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32) DEF_HELPER_2(set_teecr, void, env, i32) +DEF_HELPER_3(neon_unzip8, void, env, i32, i32) +DEF_HELPER_3(neon_unzip16, void, env, i32, i32) +DEF_HELPER_3(neon_qunzip8, void, env, i32, i32) +DEF_HELPER_3(neon_qunzip16, void, env, i32, i32) +DEF_HELPER_3(neon_qunzip32, void, env, i32, i32) + #include "def-helper.h" diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index ee253bbd2e..49f1b15945 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1693,3 +1693,97 @@ uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b) float32 f1 = float32_abs(vfp_itos(b)); return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0; } + +#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) + +void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) + | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) + | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) + | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); + uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) + | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) + | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); + uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) + | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) + | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); + uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) + | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) + | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) + | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) + | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); + uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); + uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); + uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) + | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); + uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); + uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); +} + +void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm = float64_val(env->vfp.regs[rm]); + uint64_t zd = float64_val(env->vfp.regs[rd]); + uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) + | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) + | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); + uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) + | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) + | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rd] = make_float64(d0); +} + +void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm) +{ + uint64_t zm = float64_val(env->vfp.regs[rm]); + uint64_t zd = float64_val(env->vfp.regs[rd]); + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) + | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); + uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) + | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rd] = make_float64(d0); +} diff --git a/target-arm/translate.c b/target-arm/translate.c index f3488200e6..398764e7bc 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3614,40 +3614,43 @@ static inline TCGv neon_get_scalar(int size, int reg) return tmp; } -static void gen_neon_unzip_u8(TCGv t0, TCGv t1) +static int gen_neon_unzip(int rd, int rm, int size, int q) { - TCGv rd, rm, tmp; - - rd = new_tmp(); - rm = new_tmp(); - tmp = new_tmp(); - - tcg_gen_andi_i32(rd, t0, 0xff); - tcg_gen_shri_i32(tmp, t0, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t1, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff000000); - tcg_gen_or_i32(rd, rd, tmp); - - tcg_gen_shri_i32(rm, t0, 8); - tcg_gen_andi_i32(rm, rm, 0xff); - tcg_gen_shri_i32(tmp, t0, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_shli_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_andi_i32(tmp, t1, 0xff000000); - tcg_gen_or_i32(t1, rm, tmp); - tcg_gen_mov_i32(t0, rd); - - dead_tmp(tmp); - dead_tmp(rm); - dead_tmp(rd); + TCGv tmp, tmp2; + if (size == 3 || (!q && size == 2)) { + return 1; + } + tmp = tcg_const_i32(rd); + tmp2 = tcg_const_i32(rm); + if (q) { + switch (size) { + case 0: + gen_helper_neon_qunzip8(cpu_env, tmp, tmp2); + break; + case 1: + gen_helper_neon_qunzip16(cpu_env, tmp, tmp2); + break; + case 2: + gen_helper_neon_qunzip32(cpu_env, tmp, tmp2); + break; + default: + abort(); + } + } else { + switch (size) { + case 0: + gen_helper_neon_unzip8(cpu_env, tmp, tmp2); + break; + case 1: + gen_helper_neon_unzip16(cpu_env, tmp, tmp2); + break; + default: + abort(); + } + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + return 0; } static void gen_neon_zip_u8(TCGv t0, TCGv t1) @@ -3705,25 +3708,6 @@ static void gen_neon_zip_u16(TCGv t0, TCGv t1) dead_tmp(tmp); } -static void gen_neon_unzip(int reg, int q, int tmp, int size) -{ - int n; - TCGv t0, t1; - - for (n = 0; n < q + 1; n += 2) { - t0 = neon_load_reg(reg, n); - t1 = neon_load_reg(reg, n + 1); - switch (size) { - case 0: gen_neon_unzip_u8(t0, t1); break; - case 1: gen_neon_zip_u16(t0, t1); break; /* zip and unzip are the same. */ - case 2: /* no-op */; break; - default: abort(); - } - neon_store_scratch(tmp + n, t0); - neon_store_scratch(tmp + n + 1, t1); - } -} - static void gen_neon_trn_u8(TCGv t0, TCGv t1) { TCGv rd, tmp; @@ -5440,30 +5424,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } break; case 34: /* VUZP */ - /* Reg Before After - Rd A3 A2 A1 A0 B2 B0 A2 A0 - Rm B3 B2 B1 B0 B3 B1 A3 A1 - */ - if (size == 3) + if (gen_neon_unzip(rd, rm, size, q)) { return 1; - gen_neon_unzip(rd, q, 0, size); - gen_neon_unzip(rm, q, 4, size); - if (q) { - static int unzip_order_q[8] = - {0, 2, 4, 6, 1, 3, 5, 7}; - for (n = 0; n < 8; n++) { - int reg = (n < 4) ? rd : rm; - tmp = neon_load_scratch(unzip_order_q[n]); - neon_store_reg(reg, n % 4, tmp); - } - } else { - static int unzip_order[4] = - {0, 4, 1, 5}; - for (n = 0; n < 4; n++) { - int reg = (n < 2) ? rd : rm; - tmp = neon_load_scratch(unzip_order[n]); - neon_store_reg(reg, n % 2, tmp); - } } break; case 35: /* VZIP */