diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e99c6a11589..720627bc702 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2019-05-02  Alejandro Martinez
+
+	* config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from SVE
+	ACLE branch.
+	* config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and vsi2qi from
+	SVE ACLE branch.
+	* tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check if a
+	VEC_COND_EXPR can be inserted to emulate a conditional internal function.
+	(build_vect_cond_expr): Emit the VEC_COND_EXPR.
+	(vectorizable_reduction): Use the functions above to vectorize, in a
+	fully-masked loop, codes that don't have a conditional internal
+	function.
+
 2019-05-02  Martin Liska
 
 	* cgraphclones.c: Call valid_attribute_p with 1 for
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 3f39c4c5b63..02d33b7276f 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3132,3 +3132,19 @@
   DONE;
 }
 )
+
+;; Unpredicated DOT product.
+(define_insn "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w")
+	(plus:SVE_SDI
+	  (unspec:SVE_SDI
+	    [(match_operand:<VSI2QI> 1 "register_operand" "w, w")
+	     (match_operand:<VSI2QI> 2 "register_operand" "w, w")]
+	    DOTPROD)
+	  (match_operand:SVE_SDI 3 "register_operand" "0, w")))]
+  "TARGET_SVE"
+  "@
+   <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>
+   movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
+  [(set_attr "movprfx" "*,yes")]
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 6caeeac8086..b3b2d6e470a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -663,6 +663,9 @@
 			  (QI "b") (HI "h") (SI "s") (DI "d")])
 
+;; Like Vetype, but map to types that are a quarter of the element size.
+(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
+
 ;; Equivalent of "size" for a vector element.
 (define_mode_attr Vesize [(VNx16QI "b")
 			  (VNx8HI "h") (VNx8HF "h")
@@ -1029,8 +1032,10 @@
 			(V2SF "p") (V4SF "v") (V4HF "v") (V8HF "v")])
 
-(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
-(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
+			  (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")
+			  (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")])
 
 ;; Register suffix for DOTPROD input types from the return type.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 85199342615..37edbeada5a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2019-05-02  Alejandro Martinez
+
+	* gcc.target/aarch64/sve/dot_1.c: New test for dot product.
+
 2019-05-02  Martin Liska
 
 	* gcc.target/i386/funcspec-4.c: Update scanned pattern.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
new file mode 100644
index 00000000000..8ff66714e9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_DOT(TYPE1, TYPE2) \
+TYPE1 __attribute__ ((noinline, noclone)) \
+dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \
+{ \
+  TYPE1 sum = 0; \
+  for (int i = 0; i < n; i++) \
+    { \
+      sum += x[i] * y[i]; \
+    } \
+  return sum; \
+}
+
+DEF_DOT(uint32_t, uint8_t)
+DEF_DOT(int32_t, int8_t)
+DEF_DOT(int64_t, int16_t)
+
+/* The uint16_t->uint64_t dot product requires a cast to satisfy the C
+   language rules.  */
+uint64_t __attribute__ ((noinline, noclone))
+dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n)
+{
+  uint64_t sum = 0;
+  for (int i = 0; i < n; i++)
+    {
+      sum += (unsigned int)x[i] * y[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 0edcdc7ee5f..493c1ab8c71 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5958,6 +5958,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
 	  <= TYPE_PRECISION (lhs_type));
 }
 
+/* Check if masking can be supported by inserting a conditional expression.
+   CODE is the code for the operation.  COND_FN is the conditional internal
+   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+			 tree vectype_in)
+{
+  if (cond_fn != IFN_LAST
+      && direct_internal_fn_supported_p (cond_fn, vectype_in,
+					 OPTIMIZE_FOR_SPEED))
+    return false;
+
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Insert a conditional expression to enable masked vectorization.  CODE is the
+   code for the operation.  VOP is the array of operands.  MASK is the loop
+   mask.  GSI is a statement iterator used to place the new conditional
+   expression.  */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+		      gimple_stmt_iterator *gsi)
+{
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      {
+	tree vectype = TREE_TYPE (vop[1]);
+	tree zero = build_zero_cst (vectype);
+	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+					       mask, vop[1], zero);
+	gsi_insert_before (gsi, select, GSI_SAME_STMT);
+	vop[1] = masked_op1;
+	break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -6931,6 +6980,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 
   internal_fn cond_fn = get_conditional_internal_fn (code);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
 
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -6938,6 +6988,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	{
 	  if (reduction_type != FOLD_LEFT_REDUCTION
+	      && !mask_by_cond_expr
 	      && (cond_fn == IFN_LAST
 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
 						      OPTIMIZE_FOR_SPEED)))
@@ -7101,7 +7152,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
 	{
 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-	  if (masked_loop_p)
+	  if (masked_loop_p && !mask_by_cond_expr)
 	    {
 	      /* Make sure that the reduction accumulator is vop[0].  */
 	      if (reduc_index == 1)
@@ -7125,6 +7176,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  if (op_type == ternary_op)
 	    vop[2] = vec_oprnds2[i];
 
+	  if (masked_loop_p && mask_by_cond_expr)
+	    {
+	      tree mask = vect_get_loop_mask (gsi, masks,
+					      vec_num * ncopies,
+					      vectype_in, i * ncopies + j);
+	      build_vect_cond_expr (code, vop, mask, gsi);
+	    }
+
 	  gassign *new_stmt = gimple_build_assign (vec_dest, code,
 						   vop[0], vop[1], vop[2]);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
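
As an aside, the reason build_vect_cond_expr only needs to mask one operand of the DOT_PROD_EXPR is that a lane whose multiplicand is forced to zero contributes nothing to the accumulated sum. The following hand-written SVE ACLE sketch illustrates the same idea and is not part of the patch: the function name and the assumption of an SVE-enabled toolchain with <arm_sve.h> are illustrative only, and the explicit svsel mirrors the VEC_COND_EXPR the vectorizer now emits, even though a predicated LD1 would already zero inactive lanes.

/* Hypothetical ACLE sketch: a masked uint8_t -> uint32_t dot product built
   from an unconditional UDOT by zeroing the inactive lanes of one operand,
   the same trick the vectorizer plays at the gimple level.  */
#include <arm_sve.h>
#include <stdint.h>

uint32_t
dot_u8_sketch (const uint8_t *x, const uint8_t *y, int n)
{
  svuint32_t acc = svdup_n_u32 (0);
  for (int i = 0; i < n; i += (int) svcntb ())
    {
      /* Predicate for the lanes that are still inside the arrays.  */
      svbool_t pg = svwhilelt_b8_s32 (i, n);
      svuint8_t vx = svld1_u8 (pg, x + i);
      svuint8_t vy = svld1_u8 (pg, y + i);
      /* The VEC_COND_EXPR equivalent: inactive lanes of one multiplicand
         become zero, so they add nothing to the sum.  */
      svuint8_t vx_masked = svsel_u8 (pg, vx, svdup_n_u8 (0));
      acc = svdot_u32 (acc, vx_masked, vy);
    }
  return svaddv_u32 (svptrue_b32 (), acc);
}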