i386: Introduce V2QImode vectorized arithmetic [PR103861]

This patch adds basic V2QImode infrastructure and V2QImode arithmetic
operations (plus, minus and neg).  The patched compiler can emit SSE
vectorized QImode operations (e.g. PADDB) on partial QImode vectors,
and can also synthesize the operation in integer registers as a pair of
QImode operations on the low and high bytes (e.g. %dl and %dh).

The testcase:

typedef char __v2qi __attribute__ ((__vector_size__ (2)));
__v2qi plus  (__v2qi a, __v2qi b) { return a + b; };

compiles with -O2 to:

        movl    %edi, %edx
        movl    %esi, %eax
        addb    %sil, %dl
        addb    %ah, %dh
        movl    %edx, %eax
        ret

which is much better than what the unpatched compiler produces:

        movl    %edi, %eax
        movl    %esi, %edx
        xorl    %ecx, %ecx
        movb    %dil, %cl
        movsbl  %dh, %edx
        movsbl  %ah, %eax
        addl    %edx, %eax
        addb    %sil, %cl
        movb    %al, %ch
        movl    %ecx, %eax
        ret

The V2QImode vectorization does not require vector registers, so it can
also be enabled by default for 32-bit targets without SSE.

The patch also enables vectorized V2QImode sign/zero extends.

2021-12-30  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

	PR target/103861
	* config/i386/i386.h (VALID_SSE2_REG_MODE): Add V2QImode.
	(VALID_INT_MODE_P): Ditto.
	* config/i386/i386.c (ix86_secondary_reload): Handle
	V2QImode reloads from SSE register to memory.
	(ix86_vector_mode_supported_p): Always return true for V2QImode.
	* config/i386/i386.md (*subqi_ext<mode>_2): New insn pattern.
	(*negqi_ext<mode>_2): Ditto.
	* config/i386/mmx.md (movv2qi): New expander.
	(movmisalignv2qi): Ditto.
	(*movv2qi_internal): New insn pattern.
	(*pushv2qi2): Ditto.
	(negv2qi2 and splitters): Ditto.
	(<plusminus:insn>v2qi3 and splitters): Ditto.

gcc/testsuite/ChangeLog:

	PR target/103861
	* gcc.dg/store_merging_18.c (dg-options): Add -fno-tree-vectorize.
	* gcc.dg/store_merging_29.c (dg-options): Ditto.
	* gcc.target/i386/pr103861.c: New test.
	* gcc.target/i386/pr92658-avx512vl.c (dg-final):
	Remove vpmovqb scan-assembler xfail.
	* gcc.target/i386/pr92658-sse4.c (dg-final):
	Remove pmovzxbq scan-assembler xfail.
	* gcc.target/i386/pr92658-sse4-2.c (dg-final):
	Remove pmovsxbq scan-assembler xfail.
	* gcc.target/i386/warn-vect-op-2.c (dg-warning): Adjust warnings.
This commit is contained in:
Uros Bizjak 2022-01-02 21:12:10 +01:00
parent 6bec6e3aaa
commit 9ff206d386
11 changed files with 362 additions and 13 deletions

View File

@ -19306,7 +19306,7 @@ ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
}
/* Require movement to gpr, and then store to memory. */
if ((mode == HFmode || mode == HImode)
if ((mode == HFmode || mode == HImode || mode == V2QImode)
&& !TARGET_SSE4_1
&& SSE_CLASS_P (rclass)
&& !in_p && MEM_P (x))
@ -22082,6 +22082,8 @@ ix86_vector_mode_supported_p (machine_mode mode)
if ((TARGET_3DNOW || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE_3DNOW (mode))
return true;
if (mode == V2QImode)
return true;
return false;
}

View File

@ -1039,7 +1039,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \
|| (MODE) == V8HFmode || (MODE) == V4HFmode || (MODE) == V2HFmode \
|| (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \
|| (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
|| (MODE) == V2DImode || (MODE) == V2QImode || (MODE) == DFmode \
|| (MODE) == HFmode)
#define VALID_SSE_REG_MODE(MODE) \
((MODE) == V1TImode || (MODE) == TImode \
@ -1072,7 +1073,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|| (MODE) == SDmode || (MODE) == DDmode \
|| (MODE) == HFmode || (MODE) == HCmode \
|| (MODE) == V2HImode || (MODE) == V2HFmode \
|| (MODE) == V1SImode || (MODE) == V4QImode \
|| (MODE) == V1SImode || (MODE) == V4QImode || (MODE) == V2QImode \
|| (TARGET_64BIT \
&& ((MODE) == TImode || (MODE) == CTImode \
|| (MODE) == TFmode || (MODE) == TCmode \

View File

@ -6931,6 +6931,30 @@
operands[4] = gen_rtx_SIGN_EXTEND (<DPWI>mode, operands[2]);
})
;; Byte subtraction on the second byte (the 8-bit field at bit position 8)
;; of an integer register: the field of operand 2 is subtracted from the
;; field of operand 1 and the QImode result is written to the same field
;; of operand 0.  Only that field of operand 0 is set by this pattern.
;;
;; Operand 1 is tied to operand 0 (constraint "0"), and the insn condition
;; additionally requires both to be the same rtx.  Constraint "Q" restricts
;; operands to registers whose second byte is addressable (%ah, %bh, %ch,
;; %dh); the %h operand modifier prints that high-byte register name.
;; SWI248 is a mode iterator defined elsewhere in i386.md (presumably HI,
;; SI and DI -- confirm there).  The flags register is clobbered.
(define_insn "*subqi_ext<mode>_2"
[(set (zero_extract:SWI248
(match_operand:SWI248 0 "register_operand" "+Q")
(const_int 8)
(const_int 8))
(subreg:SWI248
(minus:QI
(subreg:QI
(zero_extract:SWI248
(match_operand:SWI248 1 "register_operand" "0")
(const_int 8)
(const_int 8)) 0)
(subreg:QI
(zero_extract:SWI248
(match_operand:SWI248 2 "register_operand" "Q")
(const_int 8)
(const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])"
"sub{b}\t{%h2, %h0|%h0, %h2}"
[(set_attr "type" "alu")
(set_attr "mode" "QI")])
(define_insn "*subv<mode>4"
[(set (reg:CCO FLAGS_REG)
(eq:CCO (minus:<DWI>
@ -10901,6 +10925,25 @@
[(set_attr "type" "negnot")
(set_attr "mode" "<MODE>")])
;; Byte negation on the second byte (the 8-bit field at bit position 8)
;; of an integer register: the field of operand 1 is negated in QImode and
;; the result is written to the same field of operand 0.
;;
;; Operand 1 is tied to operand 0 (constraint "0"), and the insn condition
;; additionally requires both to be the same rtx, so the instruction is
;; emitted as a single in-place "neg" on the high-byte register (%h0).
;; Constraint "Q" restricts operand 0 to registers whose second byte is
;; addressable (%ah, %bh, %ch, %dh).  The flags register is clobbered.
(define_insn "*negqi_ext<mode>_2"
[(set (zero_extract:SWI248
(match_operand:SWI248 0 "register_operand" "+Q")
(const_int 8)
(const_int 8))
(subreg:SWI248
(neg:QI
(subreg:QI
(zero_extract:SWI248
(match_operand:SWI248 1 "register_operand" "0")
(const_int 8)
(const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
"/* FIXME: without this LRA can't reload this pattern, see PR82524. */
rtx_equal_p (operands[0], operands[1])"
"neg{b}\t%h0"
[(set_attr "type" "negnot")
(set_attr "mode" "QI")])
;; Negate with jump on overflow.
(define_expand "negv<mode>3"
[(parallel [(set (reg:CCO FLAGS_REG)

View File

@ -261,8 +261,8 @@
"=r ,m ,v,v,v,m,r,v")
(match_operand:V_32 1 "general_operand"
"rmC,rC,C,v,m,v,v,r"))]
"TARGET_SSE2 &&
!(MEM_P (operands[0]) && MEM_P (operands[1]))"
"TARGET_SSE2
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))"
{
switch (get_attr_type (insn))
{
@ -359,6 +359,174 @@
DONE;
})
;; Standard-named move expander for V2QImode.  The condition is empty, so
;; the expander is available unconditionally; all of the work is delegated
;; to ix86_expand_vector_move, and DONE suppresses emission of the
;; template pattern itself.
(define_expand "movv2qi"
[(set (match_operand:V2QI 0 "nonimmediate_operand")
(match_operand:V2QI 1 "nonimmediate_operand"))]
""
{
ix86_expand_vector_move (V2QImode, operands);
DONE;
})
;; Move insn for V2QImode values between general registers, memory and
;; vector registers.  Alternatives, in order (destination <- source):
;;   0: r <- r     1: r <- C     2: r <- m     3: m <- r/C   4: v <- C
;;   5: v <- v     6: v <- m     7: m <- v     8: r <- v     9: v <- r
;; The insn condition rejects memory-to-memory moves.  The assembler
;; output is chosen from the computed "type" attribute, and the operand
;; width of integer moves from the computed "mode" attribute.
(define_insn "*movv2qi_internal"
[(set (match_operand:V2QI 0 "nonimmediate_operand"
"=r,r,r,m ,v,v,v,m,r,v")
(match_operand:V2QI 1 "general_operand"
"r ,C,m,rC,C,v,m,v,v,r"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))"
{
switch (get_attr_type (insn))
{
case TYPE_IMOV:
/* Integer move: a 32-bit move when the "mode" attribute is SI,
   otherwise a 16-bit move.  */
if (get_attr_mode (insn) == MODE_SI)
return "mov{l}\t{%k1, %k0|%k0, %k1}";
else
return "mov{w}\t{%1, %0|%0, %1}";
case TYPE_IMOVX:
/* movzwl is faster than movw on p2 due to partial word stalls,
though not as fast as an aligned movl. */
return "movz{wl|x}\t{%1, %k0|%k0, %1}";
case TYPE_SSELOG1:
/* A constant source is materialized with the standard SSE constant
   opcode.  Otherwise the 16-bit value is inserted into (pinsrw) or
   extracted from (pextrw) word 0 of the vector register, with the
   other operand being either memory or a general register.  */
if (satisfies_constraint_C (operands[1]))
return standard_sse_constant_opcode (insn, operands);
if (SSE_REG_P (operands[0]))
return MEM_P (operands[1])
? "%vpinsrw\t{$0, %1, %d0|%d0, %1, 0}"
: "%vpinsrw\t{$0, %k1, %d0|%d0, %k1, 0}";
else
return MEM_P (operands[0])
? "%vpextrw\t{$0, %1, %0|%0, %1, 0}"
: "%vpextrw\t{$0, %1, %k0|%k0, %1, 0}";
case TYPE_SSEMOV:
return ix86_output_ssemov (insn, operands);
default:
gcc_unreachable ();
}
}
;; "isa": alternatives that touch a vector register need SSE2, except
;; the vector-register-to-memory store (alternative 7), which needs SSE4.
[(set (attr "isa")
(cond [(eq_attr "alternative" "4,5,6,8,9")
(const_string "sse2")
(eq_attr "alternative" "7")
(const_string "sse4")
]
(const_string "*")))
;; "type": vector <-> memory/general-register moves (6-9) are "ssemov"
;; with AVX512FP16 and "sselog1" otherwise; the vector constant load (4)
;; is "sselog1" and the vector-to-vector move (5) is "ssemov".  The
;; remaining integer alternatives are "imov", except that "imovx" is
;; used for alternatives 0 and 2 when TARGET_MOVX holds and none of the
;; earlier "imov" conditions matched.
(set (attr "type")
(cond [(eq_attr "alternative" "6,7,8,9")
(if_then_else (match_test "TARGET_AVX512FP16")
(const_string "ssemov")
(const_string "sselog1"))
(eq_attr "alternative" "4")
(const_string "sselog1")
(eq_attr "alternative" "5")
(const_string "ssemov")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "imov")
(and (eq_attr "alternative" "0")
(ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
(not (match_test "TARGET_HIMODE_MATH"))))
(const_string "imov")
(and (eq_attr "alternative" "1,2")
(match_operand:V2QI 1 "aligned_operand"))
(const_string "imov")
(and (match_test "TARGET_MOVX")
(eq_attr "alternative" "0,2"))
(const_string "imovx")
]
(const_string "imov")))
;; "prefix": every alternative involving a vector register (4-9) is
;; "maybe_evex"; the integer alternatives use "orig".
(set (attr "prefix")
(cond [(eq_attr "alternative" "4,5,6,7,8,9")
(const_string "maybe_evex")
]
(const_string "orig")))
;; "mode": selects the operation width.  For the integer alternatives
;; this decides between SI and HI, which the TYPE_IMOV case above reads
;; back through get_attr_mode.
(set (attr "mode")
(cond [(eq_attr "alternative" "6,7,8,9")
(if_then_else (match_test "TARGET_AVX512FP16")
(const_string "HI")
(const_string "TI"))
(eq_attr "alternative" "4")
(cond [(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI"))
(eq_attr "alternative" "5")
(cond [(match_test "TARGET_AVX512FP16")
(const_string "HI")
(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI"))
(eq_attr "type" "imovx")
(const_string "SI")
(and (eq_attr "alternative" "1,2")
(match_operand:V2QI 1 "aligned_operand"))
(const_string "SI")
(and (eq_attr "alternative" "0")
(ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
(not (match_test "TARGET_HIMODE_MATH"))))
(const_string "SI")
]
(const_string "HI")))
;; "preferred_for_speed": the direct vector -> general register move (8)
;; and general register -> vector move (9) are preferred only when the
;; corresponding inter-unit move tuning flag is set.
(set (attr "preferred_for_speed")
(cond [(eq_attr "alternative" "8")
(symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
(eq_attr "alternative" "9")
(symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
]
(symbol_ref "true")))])
;; We always round up to UNITS_PER_WORD bytes.
;;
;; Push a V2QImode value.  Alternative 0 pushes a general register or a
;; constant matching constraint "C" with a word-sized push (push{q} on
;; 64-bit targets, push{l} otherwise).  Alternative 1 takes a vector
;; register, requires SSE4 and outputs "#", i.e. it must be split into
;; other insns before final output.
;;
;; The output template starts with "@" so that each following line is the
;; template of one alternative; the first of them is a C fragment
;; (introduced by "*"), the second the "#" split marker.
(define_insn "*pushv2qi2"
[(set (match_operand:V2QI 0 "push_operand" "=X,X")
(match_operand:V2QI 1 "nonmemory_no_elim_operand" "rC,v"))]
""
"@
* return TARGET_64BIT ? \"push{q}\t%q1\" : \"push{l}\t%k1\";
#"
[(set_attr "isa" "*,sse4")
(set_attr "type" "push,multi")
;; "mode": word size for the integer push; for the vector alternative
;; HI with AVX512FP16 and TI otherwise.
(set (attr "mode")
(cond [(eq_attr "alternative" "0")
(if_then_else (match_test "TARGET_64BIT")
(const_string "DI")
(const_string "SI"))
(eq_attr "alternative" "1")
(if_then_else (match_test "TARGET_AVX512FP16")
(const_string "HI")
(const_string "TI"))
]
(const_string "HI")))])
;; After reload, and only with SSE4.1, split a push of a V2QImode value
;; held in an SSE register into two insns: an explicit stack pointer
;; adjustment followed by an ordinary store to the new top of stack.
(define_split
[(set (match_operand:V2QI 0 "push_operand")
(match_operand:V2QI 1 "sse_reg_operand"))]
"TARGET_SSE4_1 && reload_completed"
[(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
(set (match_dup 0) (match_dup 1))]
{
/* The stack pointer moves down by the rounded-up push size of the mode.  */
operands[2] = GEN_INT (-PUSH_ROUNDING (GET_MODE_SIZE (V2QImode)));
/* Preserve memory attributes. */
operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
})
;; Standard-named expander for misaligned V2QImode moves.  It is available
;; unconditionally and expands exactly like the "movv2qi" expander, by
;; delegating to ix86_expand_vector_move.
(define_expand "movmisalignv2qi"
[(set (match_operand:V2QI 0 "nonimmediate_operand")
(match_operand:V2QI 1 "nonimmediate_operand"))]
""
{
ix86_expand_vector_move (V2QImode, operands);
DONE;
})
(define_insn "sse_movntq"
[(set (match_operand:DI 0 "memory_operand" "=m,m")
(unspec:DI [(match_operand:DI 1 "register_operand" "y,r")]
@ -1461,6 +1629,58 @@
"TARGET_MMX_WITH_SSE"
"operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));")
;; V2QImode negation.  The insn always outputs "#", so it has to be split
;; (after reload) into real instructions.
;;   Alternative 0: in place in a "Q" register (one whose second byte is
;;     addressable), available on every target.
;;   Alternative 1: in a vector register ("Yw"; presumably an SSE register
;;     class with ISA-dependent restrictions -- confirm in constraints.md),
;;     requires SSE2.  The destination is earlyclobber ("&"): the
;;     SSE-register split for this insn writes zero into operand 0 before
;;     it reads operand 1, so the two must not share a register.
;; The flags register is clobbered.
(define_insn "negv2qi2"
[(set (match_operand:V2QI 0 "register_operand" "=Q,&Yw")
(neg:V2QI
(match_operand:V2QI 1 "register_operand" "0,Yw")))
(clobber (reg:CC FLAGS_REG))]
""
"#"
[(set_attr "isa" "*,sse2")
(set_attr "type" "multi")
(set_attr "mode" "QI,TI")])
;; After reload, split a V2QImode negation in general registers into two
;; QImode negations: one on the low byte (written through
;; strict_low_part) and one on the second byte, expressed on the 8-bit
;; field at bit position 8 of the HImode view of the register.  Each
;; generated insn clobbers the flags register.
(define_split
[(set (match_operand:V2QI 0 "general_reg_operand")
(neg:V2QI
(match_operand:V2QI 1 "general_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"reload_completed"
[(parallel
[(set (strict_low_part (match_dup 0))
(neg:QI (match_dup 1)))
(clobber (reg:CC FLAGS_REG))])
(parallel
[(set (zero_extract:HI (match_dup 2) (const_int 8) (const_int 8))
(subreg:HI
(neg:QI
(subreg:QI
(zero_extract:HI (match_dup 3)
(const_int 8)
(const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
{
/* Take the HImode views (operands 2 and 3) first: operands 0 and 1 are
   overwritten with their QImode views right afterwards.  */
operands[3] = gen_lowpart (HImode, operands[1]);
operands[2] = gen_lowpart (HImode, operands[0]);
operands[1] = gen_lowpart (QImode, operands[1]);
operands[0] = gen_lowpart (QImode, operands[0]);
})
;; After reload, split a V2QImode negation in SSE registers into
;; "operand 0 = 0" followed by "operand 0 = operand 0 - operand 1",
;; performed on the V4QImode views of the registers.  Operand 0 is written
;; before operand 1 is read, so the two must be different registers.
;;
;; The split is guarded by TARGET_SSE2, like the SSE-register split of the
;; V2QImode plus/minus insn, because it emits vector arithmetic.
(define_split
[(set (match_operand:V2QI 0 "sse_reg_operand")
(neg:V2QI
(match_operand:V2QI 1 "sse_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"TARGET_SSE2 && reload_completed"
[(set (match_dup 0) (match_dup 2))
(set (match_dup 0)
(minus:V4QI (match_dup 0) (match_dup 1)))]
{
/* Operand 2 is the all-zero V4QImode constant; operands 0 and 1 are
   replaced by their V4QImode views.  */
operands[2] = CONST0_RTX (V4QImode);
operands[1] = gen_lowpart (V4QImode, operands[1]);
operands[0] = gen_lowpart (V4QImode, operands[0]);
})
(define_expand "mmx_<insn><mode>3"
[(set (match_operand:MMXMODEI8 0 "register_operand")
(plusminus:MMXMODEI8
@ -1515,6 +1735,66 @@
(set_attr "type" "sseadd")
(set_attr "mode" "TI")])
;; V2QImode addition and subtraction (one insn per code of the
;; "plusminus" iterator).  The insn always outputs "#", so it has to be
;; split (after reload) into real instructions.
;;   Alternative 0: general "Q" registers (second byte addressable), with
;;     operand 1 tied to the destination; available on every target.
;;   Alternative 1: "x" registers with operand 1 tied to the destination,
;;     for SSE2 without AVX.
;;   Alternative 2: "Yw" registers with three independent operands, for
;;     AVX.
;; <comm> is an iterator-dependent constraint prefix defined elsewhere
;; (presumably "%" for the commutative code -- confirm in i386.md).
;; The flags register is clobbered.
(define_insn "<insn>v2qi3"
[(set (match_operand:V2QI 0 "register_operand" "=Q,x,Yw")
(plusminus:V2QI
(match_operand:V2QI 1 "register_operand" "<comm>0,0,Yw")
(match_operand:V2QI 2 "register_operand" "Q,x,Yw")))
(clobber (reg:CC FLAGS_REG))]
""
"#"
[(set_attr "isa" "*,sse2_noavx,avx")
(set_attr "type" "multi,sseadd,sseadd")
(set_attr "mode" "QI,TI,TI")])
;; After reload, split a V2QImode addition/subtraction in general
;; registers into two QImode operations: one on the low bytes (written
;; through strict_low_part) and one on the second bytes, expressed on the
;; 8-bit field at bit position 8 of the HImode views of the registers.
;; Each generated insn clobbers the flags register.
(define_split
[(set (match_operand:V2QI 0 "general_reg_operand")
(plusminus:V2QI
(match_operand:V2QI 1 "general_reg_operand")
(match_operand:V2QI 2 "general_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"reload_completed"
[(parallel
[(set (strict_low_part (match_dup 0))
(plusminus:QI (match_dup 1) (match_dup 2)))
(clobber (reg:CC FLAGS_REG))])
(parallel
[(set (zero_extract:HI (match_dup 3) (const_int 8) (const_int 8))
(subreg:HI
(plusminus:QI
(subreg:QI
(zero_extract:HI (match_dup 4)
(const_int 8)
(const_int 8)) 0)
(subreg:QI
(zero_extract:HI (match_dup 5)
(const_int 8)
(const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
{
/* Take the HImode views (operands 3-5) first: operands 0-2 are
   overwritten with their QImode views right afterwards.  */
operands[5] = gen_lowpart (HImode, operands[2]);
operands[4] = gen_lowpart (HImode, operands[1]);
operands[3] = gen_lowpart (HImode, operands[0]);
operands[2] = gen_lowpart (QImode, operands[2]);
operands[1] = gen_lowpart (QImode, operands[1]);
operands[0] = gen_lowpart (QImode, operands[0]);
})
;; After reload, and only with SSE2, split a V2QImode addition/subtraction
;; in SSE registers into the same operation on the V4QImode views of the
;; three registers.  The replacement pattern carries no flags clobber.
(define_split
[(set (match_operand:V2QI 0 "sse_reg_operand")
(plusminus:V2QI
(match_operand:V2QI 1 "sse_reg_operand")
(match_operand:V2QI 2 "sse_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"TARGET_SSE2 && reload_completed"
[(set (match_dup 0)
(plusminus:V4QI (match_dup 1) (match_dup 2)))]
{
/* Reinterpret all three operands as V4QImode registers.  */
operands[2] = gen_lowpart (V4QImode, operands[2]);
operands[1] = gen_lowpart (V4QImode, operands[1]);
operands[0] = gen_lowpart (V4QImode, operands[0]);
})
(define_expand "mmx_<insn><mode>3"
[(set (match_operand:MMXMODE12 0 "register_operand")
(sat_plusminus:MMXMODE12

View File

@ -1,6 +1,6 @@
/* PR tree-optimization/83843 */
/* { dg-do run } */
/* { dg-options "-O2 -fdump-tree-store-merging" } */
/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-store-merging" } */
/* { dg-final { scan-tree-dump-times "Merging successful" 3 "store-merging" { target { store_merge && { ! arm*-*-* } } } } } */
__attribute__((noipa)) void

View File

@ -1,7 +1,7 @@
/* PR tree-optimization/88709 */
/* { dg-do run { target int32 } } */
/* { dg-require-effective-target store_merge } */
/* { dg-options "-O2 -fdump-tree-store-merging-details" } */
/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-store-merging-details" } */
/* { dg-final { scan-tree-dump "New sequence of 3 stores to replace old one of 6 stores" "store-merging" { target { le && { ! arm*-*-* } } } } } */
/* { dg-final { scan-tree-dump "New sequence of \[34] stores to replace old one of 6 stores" "store-merging" { target { be && { ! arm*-*-* } } } } } */

View File

@ -0,0 +1,23 @@
/* PR target/103861 */
/* { dg-do compile } */
/* { dg-options "-O2 -dp" } */
typedef char __v2qi __attribute__ ((__vector_size__ (2)));
__v2qi and (__v2qi a, __v2qi b) { return a & b; };
__v2qi andn (__v2qi a, __v2qi b) { return a & ~b; };
__v2qi or (__v2qi a, __v2qi b) { return a | b; };
__v2qi xor (__v2qi a, __v2qi b) { return a ^ b; };
__v2qi not (__v2qi a) { return ~a; };
__v2qi plus (__v2qi a, __v2qi b) { return a + b; };
__v2qi minus (__v2qi a, __v2qi b) { return a - b; };
__v2qi neg (__v2qi a) { return -a; };
/* { dg-final { scan-assembler-not "insvhi" } } */

View File

@ -123,7 +123,7 @@ truncdb_128 (v16qi * dst, v4si * __restrict src)
/* { dg-final { scan-assembler-times "vpmovqd" 2 } } */
/* { dg-final { scan-assembler-times "vpmovqw" 2 } } */
/* { dg-final { scan-assembler-times "vpmovqb\[ \t]*%ymm" 1 } } */
/* { dg-final { scan-assembler-times "vpmovqb\[ \t]*%xmm" 1 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times "vpmovqb\[ \t]*%ymm" 1 } } */
/* { dg-final { scan-assembler-times "vpmovqb\[ \t]*%xmm" 1 } } */
/* { dg-final { scan-assembler-times "vpmovdw" 2 } } */
/* { dg-final { scan-assembler-times "vpmovdb" 2 } } */

View File

@ -81,7 +81,7 @@ bar_s8_s64 (v2di * dst, v16qi src)
dst[0] = *(v2di *) tem;
}
/* { dg-final { scan-assembler-times "pmovsxbq" 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times "pmovsxbq" 2 } } */
void
foo_s16_s32 (v4si * dst, v8hi * __restrict src)

View File

@ -81,7 +81,7 @@ bar_u8_u64 (v2di * dst, v16qi src)
dst[0] = *(v2di *) tem;
}
/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times "pmovzxbq" 2 } } */
void
foo_u16_u32 (v4si * dst, v8hi * __restrict src)

View File

@ -11,8 +11,8 @@ int main (int argc, char *argv[])
argc, 1, 15, 38, 12, -1, argc, 2};
vector (16, signed char) res[] =
{
v0 + v1, /* { dg-warning "expanded in parallel" } */
v0 - v1, /* { dg-warning "expanded in parallel" } */
v0 + v1, /* { dg-warning "expanded piecewise" } */
v0 - v1, /* { dg-warning "expanded piecewise" } */
v0 > v1, /* { dg-warning "expanded piecewise" } */
v0 & v1, /* { dg-warning "expanded in parallel" } */
__builtin_shuffle (v0, v1), /* { dg-warning "expanded piecewise" } */