backport: re PR target/81833 (PowerPC: VSX: Miscompiles ffmpeg's scalarproduct_int16_vsx at -O1)
[gcc]
2017-09-12  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

        Backport from mainline
        2017-09-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

        PR target/81833
        * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
        define_insn to a define_expand.
        (altivec_vsum2sws_direct): New define_insn.
        (altivec_vsumsws): Convert from a define_insn to a define_expand.

[gcc/testsuite]
2017-09-12  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

        Backport from mainline
        2017-09-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

        PR target/81833
        * gcc.target/powerpc/pr81833-1.c: New file.
        * gcc.target/powerpc/pr81833-2.c: New file.

From-SVN: r252042
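For context, the routine named in the PR computes an ordinary signed 16-bit dot product; the new tests below exercise the vectorized vec_msum/vec_sums version of it and check it against the expected scalar result (8 for eight pairs of 1*1). A minimal scalar sketch is shown here purely for illustration; it is not part of the commit, and the name scalarproduct_int16_ref is made up:

/* Illustrative scalar reference (not from the patch): what the vectorized
   scalarproduct_int16_vsx routine in the tests is expected to compute.  */
static int
scalarproduct_int16_ref (const short *v1, const short *v2, int order)
{
  int sum = 0;
  for (int i = 0; i < order; i++)   /* order is a multiple of 8 in the tests */
    sum += v1[i] * v2[i];
  return sum;
}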
parent e38bde5423
commit ce1a6c358b
gcc/ChangeLog
@@ -1,3 +1,14 @@
+2017-09-12  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
+
+        Backport from mainline
+        2017-09-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
+
+        PR target/81833
+        * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
+        define_insn to a define_expand.
+        (altivec_vsum2sws_direct): New define_insn.
+        (altivec_vsumsws): Convert from a define_insn to a define_expand.
+
 2017-09-11  Max Filippov  <jcmvbkbc@gmail.com>
 
        Backport from mainline
gcc/config/rs6000/altivec.md
@@ -1749,51 +1749,61 @@
   "vsum4s<VI_char>s %0,%1,%2"
   [(set_attr "type" "veccomplex")])

-;; FIXME: For the following two patterns, the scratch should only be
-;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
-;; be emitted separately.
-(define_insn "altivec_vsum2sws"
+(define_expand "altivec_vsum2sws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
+  "TARGET_ALTIVEC"
+{
+  if (VECTOR_ELT_ORDER_BIG)
+    emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
+                                            operands[2]));
+  else
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
+                                          operands[2], GEN_INT (12)));
+      emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (4)));
+    }
+  DONE;
+})
+
+; FIXME: This can probably be expressed without an UNSPEC.
+(define_insn "altivec_vsum2sws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
+                      (match_operand:V4SI 2 "register_operand" "v")]
                      UNSPEC_VSUM2SWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
+   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))]
   "TARGET_ALTIVEC"
+  "vsum2sws %0,%1,%2"
+  [(set_attr "type" "veccomplex")])
+
+(define_expand "altivec_vsumsws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
+  "TARGET_ALTIVEC"
 {
   if (VECTOR_ELT_ORDER_BIG)
-    return "vsum2sws %0,%1,%2";
+    emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
+                                           operands[2]));
   else
-    return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "VECTOR_ELT_ORDER_BIG")
-       (const_string "4")
-       (const_string "12")))])
-
-(define_insn "altivec_vsumsws"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
-                     UNSPEC_VSUMSWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
-  "TARGET_ALTIVEC"
-{
-  if (VECTOR_ELT_ORDER_BIG)
-    return "vsumsws %0,%1,%2";
-  else
-    return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "(VECTOR_ELT_ORDER_BIG)")
-       (const_string "4")
-       (const_string "12")))])
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
+      emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (12)));
+    }
+  DONE;
+})

+; FIXME: This can probably be expressed without an UNSPEC.
 (define_insn "altivec_vsumsws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
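The expanders above have to preserve the vec_sums (vsumsws) semantics regardless of element order: the saturated sum of the four word elements of the first operand plus element 3 of the second lands in element 3 of the result, and the other elements are zeroed; the vspltw/vsldoi instructions emitted on the little-endian path exist so that this element mapping still holds when the in-register layout is reversed. A rough scalar model, in AltiVec PIM-style element numbering, is sketched below for illustration only (not GCC code, and the name vsumsws_model is made up):

/* Illustrative sketch of the vsumsws (vec_sums) semantics the expander
   must preserve; element numbering follows the AltiVec PIM convention.  */
#include <stdint.h>

static void
vsumsws_model (const int32_t a[4], const int32_t b[4], int32_t res[4])
{
  int64_t sum = (int64_t) b[3] + a[0] + a[1] + a[2] + a[3];
  if (sum > INT32_MAX)          /* the hardware result saturates */
    sum = INT32_MAX;
  else if (sum < INT32_MIN)
    sum = INT32_MIN;
  res[0] = res[1] = res[2] = 0;
  res[3] = (int32_t) sum;
}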
gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2017-09-12  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
+
+        Backport from mainline
+        2017-09-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
+
+        PR target/81833
+        * gcc.target/powerpc/pr81833-1.c: New file.
+        * gcc.target/powerpc/pr81833-2.c: New file.
+
 2017-09-10  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
 
        Backport from mainline
gcc/testsuite/gcc.target/powerpc/pr81833-1.c (new file, 59 lines)
@@ -0,0 +1,59 @@
/* PR81833: This used to fail due to improper implementation of vec_msum.  */
/* Test case relies on -mcpu=power7 or later.  Currently we don't have
   machinery to express that, so we have two separate tests for -mcpu=power7
   and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8.  */

/* { dg-do run } */
/* { dg-require-effective-target p8vector_hw } */
/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
/* { dg-options "-mcpu=power8 -O2" } */

#include <altivec.h>

#define vec_u8  vector unsigned char
#define vec_s8  vector signed char
#define vec_u16 vector unsigned short
#define vec_s16 vector signed short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int
#define vec_f   vector float

#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)

#define zero_u8v  (vec_u8)  zerov
#define zero_s8v  (vec_s8)  zerov
#define zero_u16v (vec_u16) zerov
#define zero_s16v (vec_s16) zerov
#define zero_u32v (vec_u32) zerov
#define zero_s32v (vec_s32) zerov

signed int __attribute__((noinline))
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
                         int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8) {
    vec1 = vec_vsx_ld (0, v1);
    t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
    res = vec_sums (t, res);
    v1 += 8;
    v2 += 8;
  }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);

  return ires;
}

int main(void)
{
  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
    __builtin_abort ();
  return 0;
}
gcc/testsuite/gcc.target/powerpc/pr81833-2.c (new file, 59 lines)
@@ -0,0 +1,59 @@
/* PR81833: This used to fail due to improper implementation of vec_msum.  */
/* Test case relies on -mcpu=power7 or later.  Currently we don't have
   machinery to express that, so we have two separate tests for -mcpu=power7
   and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8.  */

/* { dg-do run } */
/* { dg-require-effective-target vsx_hw } */
/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power7" } } */
/* { dg-options "-mcpu=power7 -O2" } */

#include <altivec.h>

#define vec_u8  vector unsigned char
#define vec_s8  vector signed char
#define vec_u16 vector unsigned short
#define vec_s16 vector signed short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int
#define vec_f   vector float

#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)

#define zero_u8v  (vec_u8)  zerov
#define zero_s8v  (vec_s8)  zerov
#define zero_u16v (vec_u16) zerov
#define zero_s16v (vec_s16) zerov
#define zero_u32v (vec_u32) zerov
#define zero_s32v (vec_s32) zerov

signed int __attribute__((noinline))
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
                         int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8) {
    vec1 = vec_vsx_ld (0, v1);
    t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
    res = vec_sums (t, res);
    v1 += 8;
    v2 += 8;
  }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);

  return ires;
}

int main(void)
{
  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
    __builtin_abort ();
  return 0;
}