diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8acc942bfbe..176a40c6d72 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2014-05-23 Thomas Preud'homme + + PR tree-optimization/54733 + * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure. + (CMPNOP): Define. + (find_bswap_or_nop_load): New. + (find_bswap_1): Renamed to ... + (find_bswap_or_nop_1): This. Also add support for memory source. + (find_bswap): Renamed to ... + (find_bswap_or_nop): This. Also add support for memory source and + detection of bitwise operations equivalent to load in host endianness. + (execute_optimize_bswap): Likewise. Also move its leading comment back + in place and split statement transformation into ... + (bswap_replace): This. + 2014-05-22 Vladimir Makarov PR rtl-optimization/61215 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 139e750eccf..e0b594b433d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2014-05-23 Thomas Preud'homme + + PR tree-optimization/54733 + * gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap + optimization to support memory sources and bitwise operations + equivalent to load in host endianness. + * gcc.dg/optimize-bswaphi-1.c: Likewise. + * gcc.dg/optimize-bswapsi-2.c: Likewise. + * gcc.c-torture/execute/bswap-2.c: Likewise. + 2014-05-23 Thomas Preud'homme * lib/target-supports.exp: New effective targets for architectures diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c new file mode 100644 index 00000000000..e91b4874fc8 --- /dev/null +++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c @@ -0,0 +1,90 @@ +#ifdef __UINT32_TYPE__ +typedef __UINT32_TYPE__ uint32_t; +#else +typedef __UINT32_TYPE__ unsigned; +#endif + +struct bitfield { + unsigned char f0:7; + unsigned char f1:7; + unsigned char f2:7; + unsigned char f3:7; +}; + +struct ok { + unsigned char f0; + unsigned char f1; + unsigned char f2; + unsigned char f3; +}; + +union bf_or_uint32 { + struct ok inval; + struct bitfield bfval; +}; + +__attribute__ ((noinline, noclone)) uint32_t +partial_read_le32 (union bf_or_uint32 in) +{ + return in.bfval.f0 | (in.bfval.f1 << 8) + | (in.bfval.f2 << 16) | (in.bfval.f3 << 24); +} + +__attribute__ ((noinline, noclone)) uint32_t +partial_read_be32 (union bf_or_uint32 in) +{ + return in.bfval.f3 | (in.bfval.f2 << 8) + | (in.bfval.f1 << 16) | (in.bfval.f0 << 24); +} + +__attribute__ ((noinline, noclone)) uint32_t +fake_read_le32 (char *x, char *y) +{ + unsigned char c0, c1, c2, c3; + + c0 = x[0]; + c1 = x[1]; + *y = 1; + c2 = x[2]; + c3 = x[3]; + return c0 | c1 << 8 | c2 << 16 | c3 << 24; +} + +__attribute__ ((noinline, noclone)) uint32_t +fake_read_be32 (char *x, char *y) +{ + unsigned char c0, c1, c2, c3; + + c0 = x[0]; + c1 = x[1]; + *y = 1; + c2 = x[2]; + c3 = x[3]; + return c3 | c2 << 8 | c1 << 16 | c0 << 24; +} + +int +main () +{ + union bf_or_uint32 bfin; + uint32_t out; + char cin[] = { 0x83, 0x85, 0x87, 0x89 }; + + if (sizeof (uint32_t) * __CHAR_BIT__ != 32) + return 0; + bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 }; + out = partial_read_le32 (bfin); + if (out != 0x09070503 && out != 0x88868482) + __builtin_abort (); + bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 }; + out = partial_read_be32 (bfin); + if (out != 0x03050709 && out != 0x82848688) + __builtin_abort (); + out = fake_read_le32 (cin, &cin[2]); + if (out != 0x89018583) + __builtin_abort (); + out = fake_read_be32 (cin, &cin[2]); + if (out != 0x83850189) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c new file mode 100644 index 00000000000..0a8bf2e739d --- /dev/null +++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target bswap64 } */ +/* { dg-require-effective-target stdint_types } */ +/* { dg-options "-O2 -fdump-tree-bswap" } */ + +#include + +unsigned char data[8]; + +struct uint64_st { + unsigned char u0, u1, u2, u3, u4, u5, u6, u7; +}; + +uint64_t read_le64_1 (void) +{ + return (uint64_t) data[0] | ((uint64_t) data[1] << 8) + | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24) + | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40) + | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56); +} + +uint64_t read_le64_2 (struct uint64_st data) +{ + return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8) + | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24) + | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40) + | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56); +} + +uint64_t read_le64_3 (unsigned char *data) +{ + return (uint64_t) *data | ((uint64_t) *(data + 1) << 8) + | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24) + | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40) + | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56); +} + +uint64_t read_be64_1 (void) +{ + return (uint64_t) data[7] | ((uint64_t) data[6] << 8) + | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24) + | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40) + | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56); +} + +uint64_t read_be64_2 (struct uint64_st data) +{ + return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8) + | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24) + | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40) + | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56); +} + +uint64_t read_be64_3 (unsigned char *data) +{ + return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8) + | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24) + | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40) + | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56); +} + +/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 3 "bswap" } } */ +/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */ +/* { dg-final { cleanup-tree-dump "bswap" } } */ diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c new file mode 100644 index 00000000000..65bff98f9f9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target bswap16 } */ +/* { dg-require-effective-target stdint_types } */ +/* { dg-options "-O2 -fdump-tree-bswap" } */ +/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */ + +#include + +unsigned char data[2]; + +struct uint16_st { + unsigned char u0, u1; +}; + +uint32_t read_le16_1 (void) +{ + return data[0] | (data[1] << 8); +} + +uint32_t read_le16_2 (struct uint16_st data) +{ + return data.u0 | (data.u1 << 8); +} + +uint32_t read_le16_3 (unsigned char *data) +{ + return *data | (*(data + 1) << 8); +} + +uint32_t read_be16_1 (void) +{ + return data[1] | (data[0] << 8); +} + +uint32_t read_be16_2 (struct uint16_st data) +{ + return data.u1 | (data.u0 << 8); +} + +uint32_t read_be16_3 (unsigned char *data) +{ + return *(data + 1) | (*data << 8); +} + +/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 3 "bswap" } } */ +/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */ +/* { dg-final { cleanup-tree-dump "bswap" } } */ diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c new file mode 100644 index 00000000000..518b5108378 --- /dev/null +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target bswap32 } */ +/* { dg-require-effective-target stdint_types } */ +/* { dg-options "-O2 -fdump-tree-bswap" } */ +/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */ + +#include + +extern unsigned char data[4]; + +struct uint32_st { + unsigned char u0, u1, u2, u3; +}; + +uint32_t read_le32_1 (void) +{ + return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); +} + +uint32_t read_le32_2 (struct uint32_st data) +{ + return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24); +} + +uint32_t read_le32_3 (unsigned char *data) +{ + return *data | (*(data + 1) << 8) | (*(data + 2) << 16) + | (*(data + 3) << 24); +} + +uint32_t read_be32_1 (void) +{ + return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24); +} + +uint32_t read_be32_2 (struct uint32_st data) +{ + return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24); +} + +uint32_t read_be32_3 (unsigned char *data) +{ + return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16) + | (*data << 24); +} + +/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 3 "bswap" } } */ +/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */ +/* { dg-final { cleanup-tree-dump "bswap" } } */ diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 336626d0886..7c8c63ac99a 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -98,6 +98,7 @@ along with GCC; see the file COPYING3. If not see #include "is-a.h" #include "gimple.h" #include "gimple-iterator.h" +#include "gimplify.h" #include "gimplify-me.h" #include "stor-layout.h" #include "gimple-ssa.h" @@ -170,15 +171,15 @@ static struct static struct { - /* Number of hand-written 16-bit bswaps found. */ + /* Number of hand-written 16-bit nop / bswaps found. */ int found_16bit; - /* Number of hand-written 32-bit bswaps found. */ + /* Number of hand-written 32-bit nop / bswaps found. */ int found_32bit; - /* Number of hand-written 64-bit bswaps found. */ + /* Number of hand-written 64-bit nop / bswaps found. */ int found_64bit; -} bswap_stats; +} nop_stats, bswap_stats; static struct { @@ -1604,13 +1605,43 @@ make_pass_cse_sincos (gcc::context *ctxt) 0 - byte has the value 0 1..size - byte contains the content of the byte - number indexed with that value minus one */ + number indexed with that value minus one. + + To detect permutations on memory sources (arrays and structures), a symbolic + number is also associated a base address (the array or structure the load is + made from), an offset from the base address and a range which gives the + difference between the highest and lowest accessed memory location to make + such a symbolic number. The range is thus different from size which reflects + the size of the type of current expression. Note that for non memory source, + range holds the same value as size. + + For instance, for an array char a[], (short) a[0] | (short) a[3] would have + a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would + still have a size of 2 but this time a range of 1. */ struct symbolic_number { unsigned HOST_WIDEST_INT n; int size; + tree base_addr; + tree offset; + HOST_WIDE_INT bytepos; + tree alias_set; + tree vuse; + unsigned HOST_WIDE_INT range; }; +/* The number which the find_bswap_or_nop_1 result should match in + order to have a nop. The number is masked according to the size of + the symbolic number before using it. */ +#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \ + (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201) + +/* The number which the find_bswap_or_nop_1 result should match in + order to have a byte swap. The number is masked according to the + size of the symbolic number before using it. */ +#define CMPXCHG (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \ + (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708) + /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic number N. Return false if the requested operation is not permitted on a symbolic number. */ @@ -1670,13 +1701,76 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt) return true; } -/* find_bswap_1 invokes itself recursively with N and tries to perform - the operation given by the rhs of STMT on the result. If the - operation could successfully be executed the function returns the - tree expression of the source operand and NULL otherwise. */ +/* Check if STMT might be a byte swap or a nop from a memory source and returns + the answer. If so, REF is that memory source and the base of the memory area + accessed and the offset of the access from that base are recorded in N. */ + +bool +find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n) +{ + /* Leaf node is an array or component ref. Memorize its base and + offset from base to compare to other such leaf node. */ + HOST_WIDE_INT bitsize, bitpos; + enum machine_mode mode; + int unsignedp, volatilep; + + if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt)) + return false; + + n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset, + &mode, &unsignedp, &volatilep, false); + + if (TREE_CODE (n->base_addr) == MEM_REF) + { + offset_int bit_offset = 0; + tree off = TREE_OPERAND (n->base_addr, 1); + + if (!integer_zerop (off)) + { + offset_int boff, coff = mem_ref_offset (n->base_addr); + boff = wi::lshift (coff, LOG2_BITS_PER_UNIT); + bit_offset += boff; + } + + n->base_addr = TREE_OPERAND (n->base_addr, 0); + + /* Avoid returning a negative bitpos as this may wreak havoc later. */ + if (wi::neg_p (bit_offset)) + { + offset_int mask = wi::mask (LOG2_BITS_PER_UNIT, false); + offset_int tem = bit_offset.and_not (mask); + /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf. + Subtract it to BIT_OFFSET and add it (scaled) to OFFSET. */ + bit_offset -= tem; + tem = wi::arshift (tem, LOG2_BITS_PER_UNIT); + if (n->offset) + n->offset = size_binop (PLUS_EXPR, n->offset, + wide_int_to_tree (sizetype, tem)); + else + n->offset = wide_int_to_tree (sizetype, tem); + } + + bitpos += bit_offset.to_shwi (); + } + + if (bitpos % BITS_PER_UNIT) + return false; + if (bitsize % BITS_PER_UNIT) + return false; + + n->bytepos = bitpos / BITS_PER_UNIT; + n->alias_set = reference_alias_ptr_type (ref); + n->vuse = gimple_vuse (stmt); + return true; +} + +/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform + the operation given by the rhs of STMT on the result. If the operation + could successfully be executed the function returns the tree expression of + the source operand and NULL otherwise. */ static tree -find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) +find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit) { enum tree_code code; tree rhs1, rhs2 = NULL; @@ -1689,6 +1783,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) rhs1 = gimple_assign_rhs1 (stmt); + if (find_bswap_or_nop_load (stmt, rhs1, n)) + return rhs1; + if (TREE_CODE (rhs1) != SSA_NAME) return NULL_TREE; @@ -1715,11 +1812,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) && code != CONVERT_EXPR) return NULL_TREE; - source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1); + source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1); - /* If find_bswap_1 returned NULL STMT is a leaf node and we have - to initialize the symbolic number. */ - if (!source_expr1) + /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and + we have to initialize the symbolic number. */ + if (!source_expr1 || gimple_assign_load_p (rhs1_stmt)) { /* Set up the symbolic number N by setting each byte to a value between 1 and the byte size of rhs1. The highest @@ -1729,14 +1826,18 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) if (n->size % BITS_PER_UNIT != 0) return NULL_TREE; n->size /= BITS_PER_UNIT; - n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 : - (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201); + n->range = n->size; + n->n = CMPNOP; if (n->size < (int)sizeof (HOST_WIDEST_INT)) n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1; - source_expr1 = rhs1; + if (!source_expr1) + { + n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE; + source_expr1 = rhs1; + } } switch (code) @@ -1777,6 +1878,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1; } n->size = type_size / BITS_PER_UNIT; + if (!n->base_addr) + n->range = n->size; } break; default: @@ -1805,17 +1908,79 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) switch (code) { case BIT_IOR_EXPR: - source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1); + source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1); if (!source_expr1) return NULL_TREE; - source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1); + source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1); - if (source_expr1 != source_expr2 - || n1.size != n2.size) + if (n1.size != n2.size || !source_expr2) return NULL_TREE; + if (!n1.vuse != !n2.vuse || + (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0))) + return NULL_TREE; + + if (source_expr1 != source_expr2) + { + HOST_WIDEST_INT inc, mask; + unsigned i; + HOST_WIDE_INT off_sub; + struct symbolic_number *n_ptr; + + if (!n1.base_addr || !n2.base_addr + || !operand_equal_p (n1.base_addr, n2.base_addr, 0)) + return NULL_TREE; + if (!n1.offset != !n2.offset || + (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0))) + return NULL_TREE; + + /* We swap n1 with n2 to have n1 < n2. */ + if (n2.bytepos < n1.bytepos) + { + struct symbolic_number tmpn; + + tmpn = n2; + n2 = n1; + n1 = tmpn; + source_expr1 = source_expr2; + } + + off_sub = n2.bytepos - n1.bytepos; + + /* Check that the range of memory covered < biggest int size. */ + if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT)) + return NULL_TREE; + n->range = n2.range + off_sub; + + /* Reinterpret byte marks in symbolic number holding the value of + bigger weight according to host endianness. */ + inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub; + mask = 0xFF; + if (BYTES_BIG_ENDIAN) + n_ptr = &n1; + else + n_ptr = &n2; + for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8, + mask <<= 8) + { + if (n_ptr->n & mask) + n_ptr->n += inc; + } + } + else + n->range = n1.range; + + if (!n1.alias_set + || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set)) + n->alias_set = n1.alias_set; + else + n->alias_set = ptr_type_node; + n->vuse = n1.vuse; + n->base_addr = n1.base_addr; + n->offset = n1.offset; + n->bytepos = n1.bytepos; n->size = n1.size; for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT) { @@ -1840,57 +2005,75 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit) return NULL_TREE; } -/* Check if STMT completes a bswap implementation consisting of ORs, - SHIFTs and ANDs. Return the source tree expression on which the - byte swap is performed and NULL if no bswap was found. */ +/* Check if STMT completes a bswap implementation or a read in a given + endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP + accordingly. It also sets N to represent the kind of operations + performed: size of the resulting expression and whether it works on + a memory source, and if so alias-set and vuse. At last, the + function returns the source tree expression. */ static tree -find_bswap (gimple stmt) +find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap) { -/* The number which the find_bswap result should match in order to - have a full byte swap. The number is shifted to the left according - to the size of the symbolic number before using it. */ - unsigned HOST_WIDEST_INT cmp = - sizeof (HOST_WIDEST_INT) < 8 ? 0 : - (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708; +/* The number which the find_bswap_or_nop_1 result should match in order + to have a full byte swap. The number is shifted to the right + according to the size of the symbolic number before using it. */ + unsigned HOST_WIDEST_INT cmpxchg = CMPXCHG; + unsigned HOST_WIDEST_INT cmpnop = CMPNOP; - struct symbolic_number n; tree source_expr; int limit; /* The last parameter determines the depth search limit. It usually - correlates directly to the number of bytes to be touched. We - increase that number by three here in order to also - cover signed -> unsigned converions of the src operand as can be seen + correlates directly to the number n of bytes to be touched. We + increase that number by log2(n) + 1 here in order to also + cover signed -> unsigned conversions of the src operand as can be seen in libgcc, and for initial shift/and operation of the src operand. */ limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt))); limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit); - source_expr = find_bswap_1 (stmt, &n, limit); + source_expr = find_bswap_or_nop_1 (stmt, n, limit); if (!source_expr) return NULL_TREE; - /* Zero out the extra bits of N and CMP. */ - if (n.size < (int)sizeof (HOST_WIDEST_INT)) + /* Find real size of result (highest non zero byte). */ + if (n->base_addr) { - unsigned HOST_WIDEST_INT mask = - ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1; + int rsize; + unsigned HOST_WIDEST_INT tmpn; - n.n &= mask; - cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT; + for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++); + n->range = rsize; } - /* A complete byte swap should make the symbolic number to start - with the largest digit in the highest order byte. */ - if (cmp != n.n) + /* Zero out the extra bits of N and CMP*. */ + if (n->range < (int)sizeof (HOST_WIDEST_INT)) + { + unsigned HOST_WIDEST_INT mask; + + mask = ((unsigned HOST_WIDEST_INT)1 << (n->range * BITS_PER_UNIT)) - 1; + cmpxchg >>= (sizeof (HOST_WIDEST_INT) - n->range) * BITS_PER_UNIT; + cmpnop &= mask; + } + + /* A complete byte swap should make the symbolic number to start with + the largest digit in the highest order byte. Unchanged symbolic + number indicates a read with same endianness as host architecture. */ + if (n->n == cmpnop) + *bswap = false; + else if (n->n == cmpxchg) + *bswap = true; + else return NULL_TREE; + /* Useless bit manipulation performed by code. */ + if (!n->base_addr && n->n == cmpnop) + return NULL_TREE; + + n->range *= BITS_PER_UNIT; return source_expr; } -/* Find manual byte swap implementations and turn them into a bswap - builtin invokation. */ - namespace { const pass_data pass_data_optimize_bswap = @@ -1924,6 +2107,156 @@ public: }; // class pass_optimize_bswap +/* Perform the bswap optimization: replace the statement STMT at GSI + with load type, VUSE and set-alias as described by N if a memory + source is involved (N->base_addr is non null), followed by the + builtin bswap invocation in FNDECL if BSWAP is true. SRC gives + the source on which STMT is operating and N->range gives the + size of the expression involved for maintaining some statistics. */ + +static bool +bswap_replace (gimple stmt, gimple_stmt_iterator *gsi, tree src, tree fndecl, + tree bswap_type, tree load_type, struct symbolic_number *n, + bool bswap) +{ + tree tmp, tgt; + gimple call; + + tgt = gimple_assign_lhs (stmt); + + /* Need to load the value from memory first. */ + if (n->base_addr) + { + tree addr_expr, addr_tmp, val_expr, val_tmp; + tree load_offset_ptr, aligned_load_type; + gimple addr_stmt, load_stmt; + unsigned align; + + align = get_object_alignment (src); + if (bswap && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align)) + return false; + + /* Compute address to load from and cast according to the size + of the load. */ + addr_expr = build_fold_addr_expr (unshare_expr (src)); + if (is_gimple_min_invariant (addr_expr)) + addr_tmp = addr_expr; + else + { + addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL, + "load_src"); + addr_stmt = gimple_build_assign (addr_tmp, addr_expr); + gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT); + } + + /* Perform the load. */ + aligned_load_type = load_type; + if (align < TYPE_ALIGN (load_type)) + aligned_load_type = build_aligned_type (load_type, align); + load_offset_ptr = build_int_cst (n->alias_set, 0); + val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp, + load_offset_ptr); + + if (!bswap) + { + if (n->range == 16) + nop_stats.found_16bit++; + else if (n->range == 32) + nop_stats.found_32bit++; + else + { + gcc_assert (n->range == 64); + nop_stats.found_64bit++; + } + + /* Convert the result of load if necessary. */ + if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type)) + { + val_tmp = make_temp_ssa_name (aligned_load_type, NULL, + "load_dst"); + load_stmt = gimple_build_assign (val_tmp, val_expr); + gimple_set_vuse (load_stmt, n->vuse); + gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops_1 (gsi, NOP_EXPR, val_tmp, + NULL_TREE, NULL_TREE); + } + else + gimple_assign_set_rhs_with_ops_1 (gsi, MEM_REF, val_expr, + NULL_TREE, NULL_TREE); + update_stmt (gsi_stmt (*gsi)); + + if (dump_file) + { + fprintf (dump_file, + "%d bit load in host endianness found at: ", + (int)n->range); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + return true; + } + else + { + val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst"); + load_stmt = gimple_build_assign (val_tmp, val_expr); + gimple_set_vuse (load_stmt, n->vuse); + gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT); + } + src = val_tmp; + } + + if (n->range == 16) + bswap_stats.found_16bit++; + else if (n->range == 32) + bswap_stats.found_32bit++; + else + { + gcc_assert (n->range == 64); + bswap_stats.found_64bit++; + } + + tmp = src; + + /* Convert the src expression if necessary. */ + if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type)) + { + gimple convert_stmt; + tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc"); + convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src, NULL); + gsi_insert_before (gsi, convert_stmt, GSI_SAME_STMT); + } + + call = gimple_build_call (fndecl, 1, tmp); + + tmp = tgt; + + /* Convert the result if necessary. */ + if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) + { + gimple convert_stmt; + tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst"); + convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL); + gsi_insert_after (gsi, convert_stmt, GSI_SAME_STMT); + } + + gimple_call_set_lhs (call, tmp); + + if (dump_file) + { + fprintf (dump_file, "%d bit bswap implementation found at: ", + (int)n->range); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + + gsi_insert_after (gsi, call, GSI_SAME_STMT); + gsi_remove (gsi, true); + return true; +} + +/* Find manual byte swap implementations as well as load in a given + endianness. Byte swaps are turned into a bswap builtin invokation + while endian loads are converted to bswap builtin invokation or + simple load according to the host endianness. */ + unsigned int pass_optimize_bswap::execute (function *fun) { @@ -1946,9 +2279,6 @@ pass_optimize_bswap::execute (function *fun) && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing || (bswap32_p && word_mode == SImode))); - if (!bswap16_p && !bswap32_p && !bswap64_p) - return 0; - /* Determine the argument type of the builtins. The code later on assumes that the return and argument type are the same. */ if (bswap16_p) @@ -1969,6 +2299,7 @@ pass_optimize_bswap::execute (function *fun) bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); } + memset (&nop_stats, 0, sizeof (nop_stats)); memset (&bswap_stats, 0, sizeof (bswap_stats)); FOR_EACH_BB_FN (bb, fun) @@ -1982,21 +2313,24 @@ pass_optimize_bswap::execute (function *fun) for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi)) { gimple stmt = gsi_stmt (gsi); - tree bswap_src, bswap_type; - tree bswap_tmp; - tree fndecl = NULL_TREE; - int type_size; - gimple call; + tree fndecl = NULL_TREE, bswap_type = NULL_TREE; + tree src, load_type; + struct symbolic_number n; + bool bswap; if (!is_gimple_assign (stmt) || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR) continue; - type_size = TYPE_PRECISION (gimple_expr_type (stmt)); + src = find_bswap_or_nop (stmt, &n, &bswap); - switch (type_size) + if (!src) + continue; + + switch (n.range) { case 16: + load_type = uint16_type_node; if (bswap16_p) { fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16); @@ -2004,6 +2338,7 @@ pass_optimize_bswap::execute (function *fun) } break; case 32: + load_type = uint32_type_node; if (bswap32_p) { fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); @@ -2011,6 +2346,7 @@ pass_optimize_bswap::execute (function *fun) } break; case 64: + load_type = uint64_type_node; if (bswap64_p) { fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); @@ -2021,62 +2357,21 @@ pass_optimize_bswap::execute (function *fun) continue; } - if (!fndecl) + if (bswap && !fndecl) continue; - bswap_src = find_bswap (stmt); - - if (!bswap_src) - continue; - - changed = true; - if (type_size == 16) - bswap_stats.found_16bit++; - else if (type_size == 32) - bswap_stats.found_32bit++; - else - bswap_stats.found_64bit++; - - bswap_tmp = bswap_src; - - /* Convert the src expression if necessary. */ - if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type)) - { - gimple convert_stmt; - bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc"); - convert_stmt = gimple_build_assign_with_ops - (NOP_EXPR, bswap_tmp, bswap_src, NULL); - gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT); - } - - call = gimple_build_call (fndecl, 1, bswap_tmp); - - bswap_tmp = gimple_assign_lhs (stmt); - - /* Convert the result if necessary. */ - if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type)) - { - gimple convert_stmt; - bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst"); - convert_stmt = gimple_build_assign_with_ops - (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL); - gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT); - } - - gimple_call_set_lhs (call, bswap_tmp); - - if (dump_file) - { - fprintf (dump_file, "%d bit bswap implementation found at: ", - (int)type_size); - print_gimple_stmt (dump_file, stmt, 0, 0); - } - - gsi_insert_after (&gsi, call, GSI_SAME_STMT); - gsi_remove (&gsi, true); + if (bswap_replace (stmt, &gsi, src, fndecl, bswap_type, load_type, + &n, bswap)) + changed = true; } } + statistics_counter_event (fun, "16-bit nop implementations found", + nop_stats.found_16bit); + statistics_counter_event (fun, "32-bit nop implementations found", + nop_stats.found_32bit); + statistics_counter_event (fun, "64-bit nop implementations found", + nop_stats.found_64bit); statistics_counter_event (fun, "16-bit bswap implementations found", bswap_stats.found_16bit); statistics_counter_event (fun, "32-bit bswap implementations found",