target/mips: Use 8-byte memory ops for msa load/store
Rather than use 4-16 separate memory operations for each MSA load/store, use two 8-byte operations plus some byte reordering as necessary.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
parent
948f88661c
commit
68ad9260e0
|
@ -8218,47 +8218,31 @@ void helper_msa_ffint_u_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
|
|||
#define MEMOP_IDX(DF)
|
||||
#endif
|
||||
|
||||
#ifdef TARGET_WORDS_BIGENDIAN
|
||||
/* Swap the two bytes inside each of the four 16-bit lanes of x. */
static inline uint64_t bswap16x4(uint64_t x)
|
||||
{
|
||||
uint64_t m = 0x00ff00ff00ff00ffull; /* selects the low byte of every 16-bit lane */
|
||||
/* Low bytes move up by 8 bits, high bytes move down by 8 bits. */
return ((x & m) << 8) | ((x >> 8) & m);
|
||||
}
|
||||
|
||||
/*
 * Byte-swap each 32-bit half of x, leaving the halves in place.
 * NOTE(review): presumably bswap64 reverses all eight bytes (which also
 * exchanges the two halves) and ror64 by 32 moves the halves back;
 * confirm against the definitions of those helpers.
 */
static inline uint64_t bswap32x2(uint64_t x)
|
||||
{
|
||||
return ror64(bswap64(x), 32);
|
||||
}
|
||||
#endif
|
||||
|
||||
void helper_msa_ld_b(CPUMIPSState *env, uint32_t wd,
|
||||
target_ulong addr)
|
||||
{
|
||||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
pwd->b[0] = cpu_ldub_data_ra(env, addr + (0 << DF_BYTE), ra);
|
||||
pwd->b[1] = cpu_ldub_data_ra(env, addr + (1 << DF_BYTE), ra);
|
||||
pwd->b[2] = cpu_ldub_data_ra(env, addr + (2 << DF_BYTE), ra);
|
||||
pwd->b[3] = cpu_ldub_data_ra(env, addr + (3 << DF_BYTE), ra);
|
||||
pwd->b[4] = cpu_ldub_data_ra(env, addr + (4 << DF_BYTE), ra);
|
||||
pwd->b[5] = cpu_ldub_data_ra(env, addr + (5 << DF_BYTE), ra);
|
||||
pwd->b[6] = cpu_ldub_data_ra(env, addr + (6 << DF_BYTE), ra);
|
||||
pwd->b[7] = cpu_ldub_data_ra(env, addr + (7 << DF_BYTE), ra);
|
||||
pwd->b[8] = cpu_ldub_data_ra(env, addr + (8 << DF_BYTE), ra);
|
||||
pwd->b[9] = cpu_ldub_data_ra(env, addr + (9 << DF_BYTE), ra);
|
||||
pwd->b[10] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
|
||||
pwd->b[11] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
|
||||
pwd->b[12] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
|
||||
pwd->b[13] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
|
||||
pwd->b[14] = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
|
||||
pwd->b[15] = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
|
||||
#else
|
||||
pwd->b[0] = cpu_ldub_data_ra(env, addr + (7 << DF_BYTE), ra);
|
||||
pwd->b[1] = cpu_ldub_data_ra(env, addr + (6 << DF_BYTE), ra);
|
||||
pwd->b[2] = cpu_ldub_data_ra(env, addr + (5 << DF_BYTE), ra);
|
||||
pwd->b[3] = cpu_ldub_data_ra(env, addr + (4 << DF_BYTE), ra);
|
||||
pwd->b[4] = cpu_ldub_data_ra(env, addr + (3 << DF_BYTE), ra);
|
||||
pwd->b[5] = cpu_ldub_data_ra(env, addr + (2 << DF_BYTE), ra);
|
||||
pwd->b[6] = cpu_ldub_data_ra(env, addr + (1 << DF_BYTE), ra);
|
||||
pwd->b[7] = cpu_ldub_data_ra(env, addr + (0 << DF_BYTE), ra);
|
||||
pwd->b[8] = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
|
||||
pwd->b[9] = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
|
||||
pwd->b[10] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
|
||||
pwd->b[11] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
|
||||
pwd->b[12] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
|
||||
pwd->b[13] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
|
||||
pwd->b[14] = cpu_ldub_data_ra(env, addr + (9 << DF_BYTE), ra);
|
||||
pwd->b[15] = cpu_ldub_data_ra(env, addr + (8 << DF_BYTE), ra);
|
||||
#endif
|
||||
/* Load 8 bytes at a time. Vector element ordering makes this LE. */
|
||||
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
|
||||
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
|
||||
pwd->d[0] = d0;
|
||||
pwd->d[1] = d1;
|
||||
}
|
||||
|
||||
void helper_msa_ld_h(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8266,26 +8250,20 @@ void helper_msa_ld_h(CPUMIPSState *env, uint32_t wd,
|
|||
{
|
||||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
pwd->h[0] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
|
||||
pwd->h[1] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
|
||||
pwd->h[2] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
|
||||
pwd->h[3] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
|
||||
pwd->h[4] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
|
||||
pwd->h[5] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
|
||||
pwd->h[6] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
|
||||
pwd->h[7] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
|
||||
#else
|
||||
pwd->h[0] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
|
||||
pwd->h[1] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
|
||||
pwd->h[2] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
|
||||
pwd->h[3] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
|
||||
pwd->h[4] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
|
||||
pwd->h[5] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
|
||||
pwd->h[6] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
|
||||
pwd->h[7] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
|
||||
/*
|
||||
* Load 8 bytes at a time. Use little-endian load, then for
|
||||
* big-endian target, we must then swap the four halfwords.
|
||||
*/
|
||||
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
|
||||
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
|
||||
#ifdef TARGET_WORDS_BIGENDIAN
|
||||
d0 = bswap16x4(d0);
|
||||
d1 = bswap16x4(d1);
|
||||
#endif
|
||||
pwd->d[0] = d0;
|
||||
pwd->d[1] = d1;
|
||||
}
|
||||
|
||||
void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8293,18 +8271,20 @@ void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
|
|||
{
|
||||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
pwd->w[0] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
|
||||
pwd->w[1] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
|
||||
pwd->w[2] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
|
||||
pwd->w[3] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
|
||||
#else
|
||||
pwd->w[0] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
|
||||
pwd->w[1] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
|
||||
pwd->w[2] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
|
||||
pwd->w[3] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
|
||||
/*
|
||||
* Load 8 bytes at a time. Use little-endian load, then for
|
||||
* big-endian target, we must then bswap the two words.
|
||||
*/
|
||||
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
|
||||
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
|
||||
#ifdef TARGET_WORDS_BIGENDIAN
|
||||
d0 = bswap32x2(d0);
|
||||
d1 = bswap32x2(d1);
|
||||
#endif
|
||||
pwd->d[0] = d0;
|
||||
pwd->d[1] = d1;
|
||||
}
|
||||
|
||||
void helper_msa_ld_d(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8312,9 +8292,12 @@ void helper_msa_ld_d(CPUMIPSState *env, uint32_t wd,
|
|||
{
|
||||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
pwd->d[0] = cpu_ldq_data_ra(env, addr + (0 << DF_DOUBLE), ra);
|
||||
pwd->d[1] = cpu_ldq_data_ra(env, addr + (1 << DF_DOUBLE), ra);
|
||||
d0 = cpu_ldq_data_ra(env, addr + 0, ra);
|
||||
d1 = cpu_ldq_data_ra(env, addr + 8, ra);
|
||||
pwd->d[0] = d0;
|
||||
pwd->d[1] = d1;
|
||||
}
|
||||
|
||||
#define MSA_PAGESPAN(x) \
|
||||
|
@ -8344,41 +8327,9 @@ void helper_msa_st_b(CPUMIPSState *env, uint32_t wd,
|
|||
|
||||
ensure_writable_pages(env, addr, mmu_idx, ra);
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
cpu_stb_data_ra(env, addr + (0 << DF_BYTE), pwd->b[0], ra);
|
||||
cpu_stb_data_ra(env, addr + (1 << DF_BYTE), pwd->b[1], ra);
|
||||
cpu_stb_data_ra(env, addr + (2 << DF_BYTE), pwd->b[2], ra);
|
||||
cpu_stb_data_ra(env, addr + (3 << DF_BYTE), pwd->b[3], ra);
|
||||
cpu_stb_data_ra(env, addr + (4 << DF_BYTE), pwd->b[4], ra);
|
||||
cpu_stb_data_ra(env, addr + (5 << DF_BYTE), pwd->b[5], ra);
|
||||
cpu_stb_data_ra(env, addr + (6 << DF_BYTE), pwd->b[6], ra);
|
||||
cpu_stb_data_ra(env, addr + (7 << DF_BYTE), pwd->b[7], ra);
|
||||
cpu_stb_data_ra(env, addr + (8 << DF_BYTE), pwd->b[8], ra);
|
||||
cpu_stb_data_ra(env, addr + (9 << DF_BYTE), pwd->b[9], ra);
|
||||
cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[10], ra);
|
||||
cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[11], ra);
|
||||
cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[12], ra);
|
||||
cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[13], ra);
|
||||
cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[14], ra);
|
||||
cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[15], ra);
|
||||
#else
|
||||
cpu_stb_data_ra(env, addr + (7 << DF_BYTE), pwd->b[0], ra);
|
||||
cpu_stb_data_ra(env, addr + (6 << DF_BYTE), pwd->b[1], ra);
|
||||
cpu_stb_data_ra(env, addr + (5 << DF_BYTE), pwd->b[2], ra);
|
||||
cpu_stb_data_ra(env, addr + (4 << DF_BYTE), pwd->b[3], ra);
|
||||
cpu_stb_data_ra(env, addr + (3 << DF_BYTE), pwd->b[4], ra);
|
||||
cpu_stb_data_ra(env, addr + (2 << DF_BYTE), pwd->b[5], ra);
|
||||
cpu_stb_data_ra(env, addr + (1 << DF_BYTE), pwd->b[6], ra);
|
||||
cpu_stb_data_ra(env, addr + (0 << DF_BYTE), pwd->b[7], ra);
|
||||
cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[8], ra);
|
||||
cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[9], ra);
|
||||
cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[10], ra);
|
||||
cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[11], ra);
|
||||
cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[12], ra);
|
||||
cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[13], ra);
|
||||
cpu_stb_data_ra(env, addr + (9 << DF_BYTE), pwd->b[14], ra);
|
||||
cpu_stb_data_ra(env, addr + (8 << DF_BYTE), pwd->b[15], ra);
|
||||
#endif
|
||||
/*
 * Store 8 bytes at a time. Vector element ordering makes this LE.
 * The low doubleword goes to addr and the high doubleword to addr + 8;
 * the two stores must not share an offset, or d[1] would overwrite d[0]
 * and bytes 8..15 of the destination would never be written.
 */
|
||||
cpu_stq_le_data_ra(env, addr + 0, pwd->d[0], ra);
|
||||
cpu_stq_le_data_ra(env, addr + 8, pwd->d[1], ra);
|
||||
}
|
||||
|
||||
void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8387,28 +8338,19 @@ void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
|
|||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
int mmu_idx = cpu_mmu_index(env, false);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
ensure_writable_pages(env, addr, mmu_idx, ra);
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[0], ra);
|
||||
cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[1], ra);
|
||||
cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[2], ra);
|
||||
cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[3], ra);
|
||||
cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[4], ra);
|
||||
cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[5], ra);
|
||||
cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[6], ra);
|
||||
cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[7], ra);
|
||||
#else
|
||||
cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[0], ra);
|
||||
cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[1], ra);
|
||||
cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[2], ra);
|
||||
cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[3], ra);
|
||||
cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[4], ra);
|
||||
cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[5], ra);
|
||||
cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[6], ra);
|
||||
cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[7], ra);
|
||||
/* Store 8 bytes at a time. See helper_msa_ld_h. */
|
||||
d0 = pwd->d[0];
|
||||
d1 = pwd->d[1];
|
||||
#ifdef TARGET_WORDS_BIGENDIAN
|
||||
d0 = bswap16x4(d0);
|
||||
d1 = bswap16x4(d1);
|
||||
#endif
|
||||
cpu_stq_le_data_ra(env, addr + 0, d0, ra);
|
||||
cpu_stq_le_data_ra(env, addr + 8, d1, ra);
|
||||
}
|
||||
|
||||
void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8417,20 +8359,19 @@ void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
|
|||
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
|
||||
int mmu_idx = cpu_mmu_index(env, false);
|
||||
uintptr_t ra = GETPC();
|
||||
uint64_t d0, d1;
|
||||
|
||||
ensure_writable_pages(env, addr, mmu_idx, ra);
|
||||
|
||||
#if !defined(HOST_WORDS_BIGENDIAN)
|
||||
cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[0], ra);
|
||||
cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[1], ra);
|
||||
cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[2], ra);
|
||||
cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[3], ra);
|
||||
#else
|
||||
cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[0], ra);
|
||||
cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[1], ra);
|
||||
cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[2], ra);
|
||||
cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[3], ra);
|
||||
/* Store 8 bytes at a time. See helper_msa_ld_w. */
|
||||
d0 = pwd->d[0];
|
||||
d1 = pwd->d[1];
|
||||
#ifdef TARGET_WORDS_BIGENDIAN
|
||||
d0 = bswap32x2(d0);
|
||||
d1 = bswap32x2(d1);
|
||||
#endif
|
||||
cpu_stq_le_data_ra(env, addr + 0, d0, ra);
|
||||
cpu_stq_le_data_ra(env, addr + 8, d1, ra);
|
||||
}
|
||||
|
||||
void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
|
||||
|
@ -8442,6 +8383,6 @@ void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
|
|||
|
||||
ensure_writable_pages(env, addr, mmu_idx, GETPC());
|
||||
|
||||
cpu_stq_data_ra(env, addr + (0 << DF_DOUBLE), pwd->d[0], ra);
|
||||
cpu_stq_data_ra(env, addr + (1 << DF_DOUBLE), pwd->d[1], ra);
|
||||
cpu_stq_data_ra(env, addr + 0, pwd->d[0], ra);
|
||||
cpu_stq_data_ra(env, addr + 8, pwd->d[1], ra);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue