[16/n] PR85694: Add detection of averaging operations

This patch adds detection of average instructions:

       a = (((wide) b + (wide) c) >> 1);
   --> a = (wide) .AVG_FLOOR (b, c);

       a = (((wide) b + (wide) c + 1) >> 1);
   --> a = (wide) .AVG_CEIL (b, c);

in cases where users of "a" need only the low half of the result,
making the cast to (wide) redundant.  The heavy lifting was done by
earlier patches.

This showed up another problem in vectorizable_call: if the call is a
pattern definition statement rather than the main pattern statement,
the type of the vectorised call might be different from the type of the
original statement.
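
For example (an illustrative sketch following the notation used in the
comment above vect_recog_average_pattern, not actual dump output), a
statement

        TYPE avg = ((TYPE) a + (TYPE) b) >> 1;

is replaced by the pattern sequence

        NTYPE avg' = .AVG_FLOOR (a, b);     <-- pattern definition statement
        TYPE  avg  = (TYPE) avg';           <-- main pattern statement

so the lhs of the vectorised call has type NTYPE rather than TYPE, and the
dummy scalar assignment built after the call has been vectorised must take
its zero constant from the call's own lhs instead of from the lhs of the
original statement.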

2018-07-03  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	PR tree-optimization/85694
	* doc/md.texi (avgM3_floor, uavgM3_floor, avgM3_ceil)
	(uavgM3_ceil): Document new optabs.
	* doc/sourcebuild.texi (vect_avg_qi): Document new target selector.
	* internal-fn.def (IFN_AVG_FLOOR, IFN_AVG_CEIL): New internal
	functions.
	* optabs.def (savg_floor_optab, uavg_floor_optab, savg_ceil_optab)
	(uavg_ceil_optab): New optabs.
	* tree-vect-patterns.c (vect_recog_average_pattern): New function.
	(vect_vect_recog_func_ptrs): Add it.
	* tree-vect-stmts.c (vectorizable_call): Get the type of the zero
	constant directly from the associated lhs.

gcc/testsuite/
	PR tree-optimization/85694
	* lib/target-supports.exp (check_effective_target_vect_avg_qi): New
	proc.
	* gcc.dg/vect/vect-avg-1.c: New test.
	* gcc.dg/vect/vect-avg-2.c: Likewise.
	* gcc.dg/vect/vect-avg-3.c: Likewise.
	* gcc.dg/vect/vect-avg-4.c: Likewise.
	* gcc.dg/vect/vect-avg-5.c: Likewise.
	* gcc.dg/vect/vect-avg-6.c: Likewise.
	* gcc.dg/vect/vect-avg-7.c: Likewise.
	* gcc.dg/vect/vect-avg-8.c: Likewise.
	* gcc.dg/vect/vect-avg-9.c: Likewise.
	* gcc.dg/vect/vect-avg-10.c: Likewise.
	* gcc.dg/vect/vect-avg-11.c: Likewise.
	* gcc.dg/vect/vect-avg-12.c: Likewise.
	* gcc.dg/vect/vect-avg-13.c: Likewise.
	* gcc.dg/vect/vect-avg-14.c: Likewise.

From-SVN: r262335
Author: Richard Sandiford, 2018-07-03 10:03:44 +00:00 (committed by Richard Sandiford)
Commit: 0267732bae (parent 4ef79c960a)
23 changed files with 502 additions and 3 deletions

gcc/ChangeLog

@@ -1,3 +1,18 @@
2018-07-03 Richard Sandiford <richard.sandiford@arm.com>
PR tree-optimization/85694
* doc/md.texi (avgM3_floor, uavgM3_floor, avgM3_ceil)
(uavgM3_ceil): Document new optabs.
* doc/sourcebuild.texi (vect_avg_qi): Document new target selector.
* internal-fn.def (IFN_AVG_FLOOR, IFN_AVG_CEIL): New internal
functions.
* optabs.def (savg_floor_optab, uavg_floor_optab, savg_ceil_optab)
(uavg_ceil_optab): New optabs.
* tree-vect-patterns.c (vect_recog_average_pattern): New function.
(vect_vect_recog_func_ptrs): Add it.
* tree-vect-stmts.c (vectorizable_call): Get the type of the zero
constant directly from the associated lhs.
2018-07-03 Richard Sandiford <richard.sandiford@arm.com>
* tree-vect-patterns.c (vect_split_statement): New function.

gcc/doc/md.texi

@@ -5599,6 +5599,34 @@ Other shift and rotate instructions, analogous to the
Vector shift and rotate instructions that take vectors as operand 2
instead of a scalar type.
@cindex @code{avg@var{m}3_floor} instruction pattern
@cindex @code{uavg@var{m}3_floor} instruction pattern
@item @samp{avg@var{m}3_floor}
@itemx @samp{uavg@var{m}3_floor}
Signed and unsigned average instructions. These instructions add
operands 1 and 2 without truncation, divide the result by 2,
round towards -Inf, and store the result in operand 0. This is
equivalent to the C code:
@smallexample
narrow op0, op1, op2;
@dots{}
op0 = (narrow) (((wide) op1 + (wide) op2) >> 1);
@end smallexample
where the sign of @samp{narrow} determines whether this is a signed
or unsigned operation.
@cindex @code{avg@var{m}3_ceil} instruction pattern
@cindex @code{uavg@var{m}3_ceil} instruction pattern
@item @samp{avg@var{m}3_ceil}
@itemx @samp{uavg@var{m}3_ceil}
Like @samp{avg@var{m}3_floor} and @samp{uavg@var{m}3_floor}, but round
towards +Inf. This is equivalent to the C code:
@smallexample
narrow op0, op1, op2;
@dots{}
op0 = (narrow) (((wide) op1 + (wide) op2 + 1) >> 1);
@end smallexample
@cindex @code{bswap@var{m}2} instruction pattern
@item @samp{bswap@var{m}2}
Reverse the order of bytes of operand 1 and store the result in operand 0.

gcc/doc/sourcebuild.texi

@@ -1417,6 +1417,10 @@ Target supports Fortran @code{real} kinds larger than @code{real(8)}.
The target's ABI allows stack variables to be aligned to the preferred
vector alignment.
@item vect_avg_qi
Target supports both signed and unsigned averaging operations on vectors
of bytes.
@item vect_condition
Target supports vector conditional operations.

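(For reference, the new tests guard their tree-dump scans with this
selector; e.g. from vect-avg-1.c:)

    /* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
    /* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */
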
gcc/internal-fn.def

@@ -143,6 +143,11 @@ DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary)
DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first,
savg_floor, uavg_floor, binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first,
savg_ceil, uavg_ceil, binary)
DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)

gcc/optabs.def

@@ -316,6 +316,10 @@ OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
OPTAB_D (extract_last_optab, "extract_last_$a")
OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
OPTAB_D (savg_floor_optab, "avg$a3_floor")
OPTAB_D (uavg_floor_optab, "uavg$a3_floor")
OPTAB_D (savg_ceil_optab, "avg$a3_ceil")
OPTAB_D (uavg_ceil_optab, "uavg$a3_ceil")
OPTAB_D (sdot_prod_optab, "sdot_prod$I$a")
OPTAB_D (ssum_widen_optab, "widen_ssum$I$a3")
OPTAB_D (udot_prod_optab, "udot_prod$I$a")

gcc/testsuite/ChangeLog

@@ -1,3 +1,23 @@
2018-07-03 Richard Sandiford <richard.sandiford@arm.com>
PR tree-optimization/85694
* lib/target-supports.exp (check_effective_target_vect_avg_qi): New
proc.
* gcc.dg/vect/vect-avg-1.c: New test.
* gcc.dg/vect/vect-avg-2.c: Likewise.
* gcc.dg/vect/vect-avg-3.c: Likewise.
* gcc.dg/vect/vect-avg-4.c: Likewise.
* gcc.dg/vect/vect-avg-5.c: Likewise.
* gcc.dg/vect/vect-avg-6.c: Likewise.
* gcc.dg/vect/vect-avg-7.c: Likewise.
* gcc.dg/vect/vect-avg-8.c: Likewise.
* gcc.dg/vect/vect-avg-9.c: Likewise.
* gcc.dg/vect/vect-avg-10.c: Likewise.
* gcc.dg/vect/vect-avg-11.c: Likewise.
* gcc.dg/vect/vect-avg-12.c: Likewise.
* gcc.dg/vect/vect-avg-13.c: Likewise.
* gcc.dg/vect/vect-avg-14.c: Likewise.
2018-07-03 Richard Sandiford <richard.sandiford@arm.com>
* gcc.dg/vect/vect-over-widen-5.c: Test that the extensions

gcc/testsuite/gcc.dg/vect/vect-avg-1.c

@@ -0,0 +1,47 @@
/* { dg-require-effective-target vect_int } */
#include "tree-vect.h"
#define N 50
#ifndef SIGNEDNESS
#define SIGNEDNESS unsigned
#endif
#ifndef BIAS
#define BIAS 0
#endif
void __attribute__ ((noipa))
f (SIGNEDNESS char *restrict a, SIGNEDNESS char *restrict b,
SIGNEDNESS char *restrict c)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
a[i] = (b[i] + c[i] + BIAS) >> 1;
}
#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
int
main (void)
{
check_vect ();
SIGNEDNESS char a[N], b[N], c[N];
for (int i = 0; i < N; ++i)
{
b[i] = BASE1 + i * 5;
c[i] = BASE2 + i * 4;
asm volatile ("" ::: "memory");
}
f (a, b, c);
for (int i = 0; i < N; ++i)
if (a[i] != ((BASE1 + BASE2 + i * 9 + BIAS) >> 1))
__builtin_abort ();
return 0;
}
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-10.c

@@ -0,0 +1,8 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#define BIAS 2
#include "vect-avg-5.c"
/* { dg-final { scan-tree-dump-not "vect_recog_average_pattern: detected" "vect" } } */

gcc/testsuite/gcc.dg/vect/vect-avg-11.c

@@ -0,0 +1,57 @@
/* { dg-require-effective-target vect_int } */
#include "tree-vect.h"
#define N 50
#ifndef SIGNEDNESS
#define SIGNEDNESS unsigned
#endif
#ifndef BIAS
#define BIAS 0
#endif
void __attribute__ ((noipa))
f (SIGNEDNESS char *restrict a, SIGNEDNESS char *restrict b,
SIGNEDNESS char *restrict c)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
int tmp = b[i];
tmp ^= 0x55;
tmp += BIAS;
tmp += c[i];
tmp >>= 1;
tmp |= 0x40;
a[i] = tmp;
}
}
#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
int
main (void)
{
check_vect ();
SIGNEDNESS char a[N], b[N], c[N];
for (int i = 0; i < N; ++i)
{
b[i] = BASE1 + i * 5;
c[i] = BASE2 + i * 4;
asm volatile ("" ::: "memory");
}
f (a, b, c);
for (int i = 0; i < N; ++i)
if (a[i] != (((((BASE1 + i * 5) ^ 0x55)
+ (BASE2 + i * 4)
+ BIAS) >> 1) | 0x40))
__builtin_abort ();
return 0;
}
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-12.c

@@ -0,0 +1,10 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#include "vect-avg-11.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-13.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#define BIAS 1
#include "vect-avg-11.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-14.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#define BIAS 1
#include "vect-avg-11.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-2.c

@@ -0,0 +1,10 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#include "vect-avg-1.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-3.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#define BIAS 1
#include "vect-avg-1.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-4.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#define BIAS 1
#include "vect-avg-1.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-5.c

@@ -0,0 +1,51 @@
/* { dg-require-effective-target vect_int } */
#include "tree-vect.h"
#define N 50
#ifndef SIGNEDNESS
#define SIGNEDNESS unsigned
#endif
#ifndef BIAS
#define BIAS 0
#endif
void __attribute__ ((noipa))
f (SIGNEDNESS char *restrict a, SIGNEDNESS char *restrict b,
SIGNEDNESS char *restrict c)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
int tmp1 = b[i] + BIAS;
int tmp2 = tmp1 + c[i];
a[i] = tmp2 >> 1;
}
}
#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
int
main (void)
{
check_vect ();
SIGNEDNESS char a[N], b[N], c[N];
for (int i = 0; i < N; ++i)
{
b[i] = BASE1 + i * 5;
c[i] = BASE2 + i * 4;
asm volatile ("" ::: "memory");
}
f (a, b, c);
for (int i = 0; i < N; ++i)
if (a[i] != ((BASE1 + BASE2 + i * 9 + BIAS) >> 1))
__builtin_abort ();
return 0;
}
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-6.c

@@ -0,0 +1,10 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#include "vect-avg-5.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-7.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#define BIAS 1
#include "vect-avg-5.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-8.c

@@ -0,0 +1,11 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS signed
#define BIAS 1
#include "vect-avg-5.c"
/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.AVG_CEIL} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-not {vector\([^\n]*short} "vect" { target vect_avg_qi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_avg_qi } } } */

gcc/testsuite/gcc.dg/vect/vect-avg-9.c

@@ -0,0 +1,8 @@
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#define BIAS 2
#include "vect-avg-5.c"
/* { dg-final { scan-tree-dump-not "vect_recog_average_pattern: detected" "vect" } } */

gcc/testsuite/lib/target-supports.exp

@@ -6313,6 +6313,13 @@ proc check_effective_target_vect_usad_char { } {
return $et_vect_usad_char_saved($et_index)
}
# Return 1 if the target plus current options supports both signed
# and unsigned average operations on vectors of bytes.
proc check_effective_target_vect_avg_qi {} {
return 0
}
# Return 1 if the target plus current options supports a vector
# demotion (packing) of shorts (to chars) and ints (to shorts)
# using modulo arithmetic, 0 otherwise.

gcc/tree-vect-patterns.c

@@ -1721,6 +1721,153 @@ vect_recog_over_widening_pattern (vec<gimple *> *stmts, tree *type_out)
return pattern_stmt;
}
/* Recognize the patterns:
ATYPE a; // narrower than TYPE
BTYPE b; // narrower than TYPE
(1) TYPE avg = ((TYPE) a + (TYPE) b) >> 1;
or (2) TYPE avg = ((TYPE) a + (TYPE) b + 1) >> 1;
where only the bottom half of avg is used. Try to transform them into:
(1) NTYPE avg' = .AVG_FLOOR ((NTYPE) a, (NTYPE) b);
or (2) NTYPE avg' = .AVG_CEIL ((NTYPE) a, (NTYPE) b);
followed by:
TYPE avg = (TYPE) avg';
where NTYPE is no wider than half of TYPE. Since only the bottom half
of avg is used, all or part of the cast of avg' should become redundant. */
static gimple *
vect_recog_average_pattern (vec<gimple *> *stmts, tree *type_out)
{
/* Check for a shift right by one bit. */
gassign *last_stmt = dyn_cast <gassign *> (stmts->pop ());
if (!last_stmt
|| gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR
|| !integer_onep (gimple_assign_rhs2 (last_stmt)))
return NULL;
stmt_vec_info last_stmt_info = vinfo_for_stmt (last_stmt);
vec_info *vinfo = last_stmt_info->vinfo;
/* Check that the shift result is wider than the users of the
result need (i.e. that narrowing would be a natural choice). */
tree lhs = gimple_assign_lhs (last_stmt);
tree type = TREE_TYPE (lhs);
unsigned int target_precision
= vect_element_precision (last_stmt_info->min_output_precision);
if (!INTEGRAL_TYPE_P (type) || target_precision >= TYPE_PRECISION (type))
return NULL;
/* Get the definition of the shift input. */
tree rshift_rhs = gimple_assign_rhs1 (last_stmt);
stmt_vec_info plus_stmt_info = vect_get_internal_def (vinfo, rshift_rhs);
if (!plus_stmt_info)
return NULL;
/* Check whether the shift input can be seen as a tree of additions on
2 or 3 widened inputs.
Note that the pattern should be a win even if the result of one or
more additions is reused elsewhere: if the pattern matches, we'd be
replacing 2N RSHIFT_EXPRs and N VEC_PACK_*s with N IFN_AVG_*s. */
internal_fn ifn = IFN_AVG_FLOOR;
vect_unpromoted_value unprom[3];
tree new_type;
unsigned int nops = vect_widened_op_tree (plus_stmt_info, PLUS_EXPR,
PLUS_EXPR, false, 3,
unprom, &new_type);
if (nops == 0)
return NULL;
if (nops == 3)
{
/* Check that one operand is 1. */
unsigned int i;
for (i = 0; i < 3; ++i)
if (integer_onep (unprom[i].op))
break;
if (i == 3)
return NULL;
/* Throw away the 1 operand and keep the other two. */
if (i < 2)
unprom[i] = unprom[2];
ifn = IFN_AVG_CEIL;
}
vect_pattern_detected ("vect_recog_average_pattern", last_stmt);
/* We know that:
(a) the operation can be viewed as:
TYPE widened0 = (TYPE) UNPROM[0];
TYPE widened1 = (TYPE) UNPROM[1];
TYPE tmp1 = widened0 + widened1 {+ 1};
TYPE tmp2 = tmp1 >> 1; // LAST_STMT_INFO
(b) the first two statements are equivalent to:
TYPE widened0 = (TYPE) (NEW_TYPE) UNPROM[0];
TYPE widened1 = (TYPE) (NEW_TYPE) UNPROM[1];
(c) vect_recog_over_widening_pattern has already tried to narrow TYPE
where sensible;
(d) all the operations can be performed correctly at twice the width of
NEW_TYPE, due to the nature of the average operation; and
(e) users of the result of the right shift need only TARGET_PRECISION
bits, where TARGET_PRECISION is no more than half of TYPE's
precision.
Under these circumstances, the only situation in which NEW_TYPE
could be narrower than TARGET_PRECISION is if widened0, widened1
and an addition result are all used more than once. Thus we can
treat any widening of UNPROM[0] and UNPROM[1] to TARGET_PRECISION
as "free", whereas widening the result of the average instruction
from NEW_TYPE to TARGET_PRECISION would be a new operation. It's
therefore better not to go narrower than TARGET_PRECISION. */
if (TYPE_PRECISION (new_type) < target_precision)
new_type = build_nonstandard_integer_type (target_precision,
TYPE_UNSIGNED (new_type));
/* Check for target support. */
tree new_vectype = get_vectype_for_scalar_type (new_type);
if (!new_vectype
|| !direct_internal_fn_supported_p (ifn, new_vectype,
OPTIMIZE_FOR_SPEED))
return NULL;
/* The IR requires a valid vector type for the cast result, even though
it's likely to be discarded. */
*type_out = get_vectype_for_scalar_type (type);
if (!*type_out)
return NULL;
/* Generate the IFN_AVG* call. */
tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
tree new_ops[2];
vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
unprom, new_vectype);
gcall *average_stmt = gimple_build_call_internal (ifn, 2, new_ops[0],
new_ops[1]);
gimple_call_set_lhs (average_stmt, new_var);
gimple_set_location (average_stmt, gimple_location (last_stmt));
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
"created pattern stmt: ");
dump_gimple_stmt (MSG_NOTE, TDF_SLIM, average_stmt, 0);
}
stmts->safe_push (last_stmt);
return vect_convert_output (last_stmt_info, type, average_stmt, new_vectype);
}
/* Recognize cases in which the input to a cast is wider than its
output, and the input is fed by a widening operation. Fold this
by removing the unnecessary intermediate widening. E.g.:
@@ -4670,6 +4817,9 @@ struct vect_recog_func
less complex ones (widen_sum only after dot_prod or sad for example).
static vect_recog_func vect_vect_recog_func_ptrs[] = {
{ vect_recog_over_widening_pattern, "over_widening" },
/* Must come after over_widening, which narrows the shift as much as
possible beforehand. */
{ vect_recog_average_pattern, "average" },
{ vect_recog_cast_forwprop_pattern, "cast_forwprop" },
{ vect_recog_widen_mult_pattern, "widen_mult" },
{ vect_recog_dot_prod_pattern, "dot_prod" },

gcc/tree-vect-stmts.c

@@ -3116,7 +3116,7 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
gcall *stmt;
tree vec_dest;
tree scalar_dest;
tree op, type;
tree op;
tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (gs), prev_stmt_info;
tree vectype_out, vectype_in;
@@ -3592,12 +3592,11 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (slp_node)
return true;
type = TREE_TYPE (scalar_dest);
if (is_pattern_stmt_p (stmt_info))
stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
lhs = gimple_get_lhs (stmt_info->stmt);
new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
set_vinfo_for_stmt (new_stmt, stmt_info);
set_vinfo_for_stmt (stmt_info->stmt, NULL);
STMT_VINFO_STMT (stmt_info) = new_stmt;