diff --git a/gcc/ChangeLog b/gcc/ChangeLog index eba58f69f80..aa295f09368 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2019-08-15 H.J. Lu + + PR target/90878 + * config/i386/i386.c (inline_memory_move_cost): Use hard_register + for costs of hard register moves. + (ix86_register_move_cost): Likewise. + * config/i386/i386.h (processor_costs): Move costs of hard + register moves to hard_register. Add int_load, int_store, + xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse, + sse_load, sse_store, sse_unaligned_load and sse_unaligned_store + for costs of RTL expressions. + * config/i386/x86-tune-costs.h: Move costs of hard register + moves to hard_register. Duplicate int_load, int_store, + xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse, + sse_load, sse_store for costs of RTL expressions. + 2019-08-15 Richard Sandiford * target.def (setup_incoming_vararg_bounds): Remove. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5eb625ce724..647bcbef050 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -18464,8 +18464,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) return 100; } if (in == 2) - return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); - return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; + return MAX (ix86_cost->hard_register.fp_load [index], + ix86_cost->hard_register.fp_store [index]); + return in ? ix86_cost->hard_register.fp_load [index] + : ix86_cost->hard_register.fp_store [index]; } if (SSE_CLASS_P (regclass)) { @@ -18473,8 +18475,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) if (index == -1) return 100; if (in == 2) - return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); - return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; + return MAX (ix86_cost->hard_register.sse_load [index], + ix86_cost->hard_register.sse_store [index]); + return in ? 
ix86_cost->hard_register.sse_load [index] + : ix86_cost->hard_register.sse_store [index]; } if (MMX_CLASS_P (regclass)) { @@ -18491,8 +18495,10 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) return 100; } if (in == 2) - return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); - return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; + return MAX (ix86_cost->hard_register.mmx_load [index], + ix86_cost->hard_register.mmx_store [index]); + return in ? ix86_cost->hard_register.mmx_load [index] + : ix86_cost->hard_register.mmx_store [index]; } switch (GET_MODE_SIZE (mode)) { @@ -18500,37 +18506,41 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) if (Q_CLASS_P (regclass) || TARGET_64BIT) { if (!in) - return ix86_cost->int_store[0]; + return ix86_cost->hard_register.int_store[0]; if (TARGET_PARTIAL_REG_DEPENDENCY && optimize_function_for_speed_p (cfun)) - cost = ix86_cost->movzbl_load; + cost = ix86_cost->hard_register.movzbl_load; else - cost = ix86_cost->int_load[0]; + cost = ix86_cost->hard_register.int_load[0]; if (in == 2) - return MAX (cost, ix86_cost->int_store[0]); + return MAX (cost, ix86_cost->hard_register.int_store[0]); return cost; } else { if (in == 2) - return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); + return MAX (ix86_cost->hard_register.movzbl_load, + ix86_cost->hard_register.int_store[0] + 4); if (in) - return ix86_cost->movzbl_load; + return ix86_cost->hard_register.movzbl_load; else - return ix86_cost->int_store[0] + 4; + return ix86_cost->hard_register.int_store[0] + 4; } break; case 2: if (in == 2) - return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); - return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; + return MAX (ix86_cost->hard_register.int_load[1], + ix86_cost->hard_register.int_store[1]); + return in ? 
ix86_cost->hard_register.int_load[1] + : ix86_cost->hard_register.int_store[1]; default: if (in == 2) - cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); + cost = MAX (ix86_cost->hard_register.int_load[2], + ix86_cost->hard_register.int_store[2]); else if (in) - cost = ix86_cost->int_load[2]; + cost = ix86_cost->hard_register.int_load[2]; else - cost = ix86_cost->int_store[2]; + cost = ix86_cost->hard_register.int_store[2]; /* Multiply with the number of GPR moves needed. */ return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); } @@ -18600,20 +18610,21 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, because of missing QImode and HImode moves to, from or between MMX/SSE registers. */ return MAX (8, SSE_CLASS_P (class1) - ? ix86_cost->sse_to_integer : ix86_cost->integer_to_sse); + ? ix86_cost->hard_register.sse_to_integer + : ix86_cost->hard_register.integer_to_sse); if (MAYBE_FLOAT_CLASS_P (class1)) - return ix86_cost->fp_move; + return ix86_cost->hard_register.fp_move; if (MAYBE_SSE_CLASS_P (class1)) { if (GET_MODE_BITSIZE (mode) <= 128) - return ix86_cost->xmm_move; + return ix86_cost->hard_register.xmm_move; if (GET_MODE_BITSIZE (mode) <= 256) - return ix86_cost->ymm_move; - return ix86_cost->zmm_move; + return ix86_cost->hard_register.ymm_move; + return ix86_cost->hard_register.zmm_move; } if (MAYBE_MMX_CLASS_P (class1)) - return ix86_cost->mmx_move; + return ix86_cost->hard_register.mmx_move; return 2; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 800d7c4c4e3..e0a77e1fb25 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -237,9 +237,46 @@ struct stringop_algs } size [MAX_STRINGOP_ALGS]; }; -/* Define the specific costs for a given cpu */ +/* Define the specific costs for a given cpu. NB: hard_register is used + by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute + hard register move costs by register allocator. 
Relative costs of + pseudo register load and store versus pseudo register moves in RTL + expressions for TARGET_RTX_COSTS can be different from relative + costs of hard registers to get the most efficient operations with + pseudo registers. */ struct processor_costs { + /* Costs used by register allocator. integer->integer register move + cost is 2. */ + struct + { + const int movzbl_load; /* cost of loading using movzbl */ + const int int_load[3]; /* cost of loading integer registers + in QImode, HImode and SImode relative + to reg-reg move (2). */ + const int int_store[3]; /* cost of storing integer register + in QImode, HImode and SImode */ + const int fp_move; /* cost of reg,reg fld/fst */ + const int fp_load[3]; /* cost of loading FP register + in SFmode, DFmode and XFmode */ + const int fp_store[3]; /* cost of storing FP register + in SFmode, DFmode and XFmode */ + const int mmx_move; /* cost of moving MMX register. */ + const int mmx_load[2]; /* cost of loading MMX register + in SImode and DImode */ + const int mmx_store[2]; /* cost of storing MMX register + in SImode and DImode */ + const int xmm_move; /* cost of moving XMM register. */ + const int ymm_move; /* cost of moving YMM register. */ + const int zmm_move; /* cost of moving ZMM register. */ + const int sse_load[5]; /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + const int sse_store[5]; /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + const int sse_to_integer; /* cost of moving SSE register to integer. */ + const int integer_to_sse; /* cost of moving integer register to SSE. */ + } hard_register; + const int add; /* cost of an add instruction */ const int lea; /* cost of a lea instruction */ const int shift_var; /* variable shift costs */ @@ -254,32 +291,20 @@ struct processor_costs { const int large_insn; /* insns larger than this cost more */ const int move_ratio; /* The threshold of number of scalar memory-to-memory move insns. 
*/ - const int movzbl_load; /* cost of loading using movzbl */ const int int_load[3]; /* cost of loading integer registers in QImode, HImode and SImode relative to reg-reg move (2). */ const int int_store[3]; /* cost of storing integer register in QImode, HImode and SImode */ - const int fp_move; /* cost of reg,reg fld/fst */ - const int fp_load[3]; /* cost of loading FP register - in SFmode, DFmode and XFmode */ - const int fp_store[3]; /* cost of storing FP register - in SFmode, DFmode and XFmode */ - const int mmx_move; /* cost of moving MMX register. */ - const int mmx_load[2]; /* cost of loading MMX register - in SImode and DImode */ - const int mmx_store[2]; /* cost of storing MMX register - in SImode and DImode */ - const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ - zmm_move; const int sse_load[5]; /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - const int sse_unaligned_load[5];/* cost of unaligned load. */ const int sse_store[5]; /* cost of storing SSE register - in SImode, DImode and TImode. */ + in 32bit, 64bit, 128bit, 256bit and 512bit */ + const int sse_unaligned_load[5];/* cost of unaligned load. */ const int sse_unaligned_store[5];/* cost of unaligned store. */ + const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ + zmm_move; const int sse_to_integer; /* cost of moving SSE register to integer. */ - const int integer_to_sse; /* cost of moving integer register to SSE. */ const int gather_static, gather_per_elt; /* Cost of gather load is computed as static + per_item * nelts. 
*/ const int scatter_static, scatter_per_elt; /* Cost of gather store is diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 8b963c07051..ad9ea4bfa08 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -36,6 +36,30 @@ static stringop_algs ix86_size_memset[2] = { const struct processor_costs ix86_size_cost = {/* costs for tuning for size */ + /* Start of register allocator costs. integer->integer move cost is 2. */ + 2, /* cost for loading QImode using movzbl */ + {2, 2, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 2}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {2, 2, 2}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 3, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {3, 3}, /* cost of storing MMX registers + in SImode and DImode */ + 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ + {3, 3, 3, 3, 3}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {3, 3, 3, 3, 3}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_BYTES (2), /* cost of an add instruction */ COSTS_N_BYTES (3), /* cost of a lea instruction */ COSTS_N_BYTES (2), /* variable shift costs */ @@ -55,33 +79,20 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (3), /* cost of movzx */ 0, /* "large" insn */ 2, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2. */ - 2, /* cost for loading QImode using movzbl */ {2, 2, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 2, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 2}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {2, 2, 2}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 3, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers - in SImode and DImode */ - {3, 3}, /* cost of storing MMX registers - in SImode and DImode */ - 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ - {3, 3, 3, 3, 3}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {3, 3, 3, 3, 3}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {3, 3, 3, 3, 3}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {3, 3, 3, 3, 3}, /* cost of unaligned SSE load in 128bit, 256bit and 512bit */ - {3, 3, 3, 3, 3}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ - {3, 3, 3, 3, 3}, /* cost of unaligned SSE store + {3, 3, 3, 3, 3}, /* cost of unaligned SSE store in 128bit, 256bit and 512bit */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 5, 0, /* Gather load static, per_elt. */ 5, 0, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -127,6 +138,30 @@ static stringop_algs i386_memset[2] = { static const struct processor_costs i386_cost = { /* 386 specific costs */ + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (3), /* variable shift costs */ @@ -146,32 +181,18 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (2), /* cost of movzx */ 15, /* "large" insn */ 3, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 4, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {8, 8, 8}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {8, 8, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 8, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ - {4, 8, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -216,6 +237,30 @@ static stringop_algs i486_memset[2] = { static const struct processor_costs i486_cost = { /* 486 specific costs */ + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {8, 8, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (3), /* variable shift costs */ @@ -235,32 +280,18 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (2), /* cost of movzx */ 15, /* "large" insn */ 3, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 4, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {8, 8, 8}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {8, 8, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 8, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ - {4, 8, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 4, /* size of l1 cache. 486 has 8kB cache @@ -307,6 +338,30 @@ static stringop_algs pentium_memset[2] = { static const struct processor_costs pentium_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 8, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (4), /* variable shift costs */ @@ -326,32 +381,18 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 6, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 4, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 6}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 6}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 8, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 8, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ - {4, 8, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -389,6 +430,30 @@ struct processor_costs pentium_cost = { static const struct processor_costs lakemont_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {2, 4, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 4, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 8, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -408,32 +473,18 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {2, 4, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 4, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 6}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 6}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 8, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {8, 8}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 8, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ - {4, 8, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -486,6 +537,30 @@ static stringop_algs pentiumpro_memset[2] = { DUMMY_STRINGOP_ALGS}; static const struct processor_costs pentiumpro_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 2, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 8, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -505,32 +580,18 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 6, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 2, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 2, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 6}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 6}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers - in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 8, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 8, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 8, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ - {4, 8, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -574,6 +635,30 @@ static stringop_algs geode_memset[2] = { DUMMY_STRINGOP_ALGS}; static const struct processor_costs geode_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 2, /* cost for loading QImode using movzbl */ + {2, 2, 2}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 2, 2}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {2, 2, 2}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 6, 6}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {2, 2, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (2), /* variable shift costs */ @@ -593,33 +678,18 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 4, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 2, /* cost for loading QImode using movzbl */ {2, 2, 2}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 2, 2}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {2, 2, 2}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 6, 6}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - - 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers - in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {2, 2, 8, 16, 32}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {2, 2, 8, 16, 32}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ - {2, 2, 8, 16, 32}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -663,6 +733,30 @@ static stringop_algs k6_memset[2] = { DUMMY_STRINGOP_ALGS}; static const struct processor_costs k6_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 3, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 3, 2}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {2, 2}, /* cost of loading MMX registers + in SImode and DImode */ + {2, 2}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {2, 2, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -682,32 +776,18 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of movzx */ 8, /* "large" insn */ 4, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 3, /* cost for loading QImode using movzbl */ {4, 5, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 3, 2}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {6, 6, 6}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {4, 4, 4}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers - in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {2, 2, 8, 16, 32}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {2, 2, 8, 16, 32}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {2, 2, 8, 16, 32}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ - {2, 2, 8, 16, 32}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -757,6 +837,30 @@ static stringop_algs athlon_memset[2] = { DUMMY_STRINGOP_ALGS}; static const struct processor_costs athlon_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {4, 4}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 4, 12, 12, 24}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 10, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 5, 5, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -776,32 +880,18 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {3, 4, 3}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {4, 4}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 4, 12, 12, 24}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 4, 12, 12, 24}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 4, 10, 10, 20}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ - {4, 4, 10, 10, 20}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ - 5, 5, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 5, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -853,6 +943,30 @@ static stringop_algs k8_memset[2] = { {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs k8_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 3, 12, 12, 24}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 10, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 5, 5, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -872,32 +986,18 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {3, 4, 3}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 3, 12, 12, 24}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 3, 12, 12, 24}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 4, 10, 10, 20}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ - {4, 4, 10, 10, 20}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ - 5, 5, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 5, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -953,6 +1053,39 @@ static stringop_algs amdfam10_memset[2] = { {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; struct processor_costs amdfam10_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {4, 4, 3, 6, 12}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {4, 4, 5, 10, 20}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 3, 3, /* SSE->integer and integer->SSE moves */ + + /* On K8: + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10: + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -972,40 +1105,18 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {3, 4, 3}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {4, 4, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 8}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {3, 3}, /* cost of loading MMX registers - in SImode and DImode */ - {4, 4}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 4, 3, 6, 12}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {4, 4, 3, 6, 12}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {4, 4, 5, 10, 20}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ - {4, 4, 5, 10, 20}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ - 3, 3, /* SSE->integer and integer->SSE moves */ - /* On K8: - MOVD reg64, xmmreg Double FSTORE 4 - MOVD reg32, xmmreg Double FSTORE 4 - On AMDFAM10: - MOVD reg64, xmmreg Double FADD 3 - 1/1 1/1 - MOVD reg32, xmmreg Double FADD 3 - 1/1 1/1 */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 3, /* cost of moving SSE register to integer. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1062,6 +1173,30 @@ static stringop_algs bdver_memset[2] = { {-1, libcall, false}}}}; const struct processor_costs bdver_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {8, 8, 8}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {10, 10, 18}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 4, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers + in SImode and DImode */ + {10, 10}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 10, 40, 60}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 10, 40, 60}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 16, 20, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1081,32 +1216,18 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 8, /* cost for loading QImode using movzbl */ {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {8, 8, 8}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 28}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {10, 10, 18}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 4, /* cost of moving MMX register */ - {12, 12}, /* cost of loading MMX registers - in SImode and DImode */ - {10, 10}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 10, 40, 60}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {12, 12, 10, 40, 60}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {10, 10, 10, 40, 60}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ - {10, 10, 10, 40, 60}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ - 16, 20, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 16, /* cost of moving SSE register to integer. */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 16, /* size of l1 cache. */ @@ -1164,31 +1285,7 @@ static stringop_algs znver1_memset[2] = { {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; struct processor_costs znver1_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction. */ - COSTS_N_INSNS (1), /* cost of a lea instruction. */ - COSTS_N_INSNS (1), /* variable shift costs. */ - COSTS_N_INSNS (1), /* constant shift costs. */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ - COSTS_N_INSNS (3), /* HI. */ - COSTS_N_INSNS (3), /* SI. */ - COSTS_N_INSNS (3), /* DI. */ - COSTS_N_INSNS (3)}, /* other. */ - 0, /* cost of multiply per each bit - set. */ - /* Depending on parameters, idiv can get faster on ryzen. This is upper - bound. 
*/ - {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ - COSTS_N_INSNS (22), /* HI. */ - COSTS_N_INSNS (30), /* SI. */ - COSTS_N_INSNS (45), /* DI. */ - COSTS_N_INSNS (45)}, /* other. */ - COSTS_N_INSNS (1), /* cost of movsx. */ - COSTS_N_INSNS (1), /* cost of movzx. */ - 8, /* "large" insn. */ - 9, /* MOVE_RATIO. */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ + /* Start of register allocator costs. integer->integer move cost is 2. */ /* reg-reg moves are done by renaming and thus they are even cheaper than 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond @@ -1214,11 +1311,46 @@ struct processor_costs znver1_cost = { 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ {6, 6, 6, 12, 24}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit. */ - {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ {8, 8, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit. */ - {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 6, 6, /* SSE->integer and integer->SSE moves. */ + /* End of register allocator costs. */ + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (3), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit + set. */ + /* Depending on parameters, idiv can get faster on ryzen. This is upper + bound. */ + {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (22), /* HI. */ + COSTS_N_INSNS (30), /* SI. */ + COSTS_N_INSNS (45), /* DI. */ + COSTS_N_INSNS (45)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 9, /* MOVE_RATIO. 
*/ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer + registers. */ + {6, 6, 6, 12, 24}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 16, 32}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ + {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ + 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ + 6, /* cost of moving SSE register to integer. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1288,31 +1420,7 @@ static stringop_algs znver2_memset[2] = { {-1, libcall, false}}}}; struct processor_costs znver2_cost = { - COSTS_N_INSNS (1), /* cost of an add instruction. */ - COSTS_N_INSNS (1), /* cost of a lea instruction. */ - COSTS_N_INSNS (1), /* variable shift costs. */ - COSTS_N_INSNS (1), /* constant shift costs. */ - {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ - COSTS_N_INSNS (3), /* HI. */ - COSTS_N_INSNS (3), /* SI. */ - COSTS_N_INSNS (3), /* DI. */ - COSTS_N_INSNS (3)}, /* other. */ - 0, /* cost of multiply per each bit - set. */ - /* Depending on parameters, idiv can get faster on ryzen. This is upper - bound. */ - {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ - COSTS_N_INSNS (22), /* HI. */ - COSTS_N_INSNS (30), /* SI. */ - COSTS_N_INSNS (45), /* DI. */ - COSTS_N_INSNS (45)}, /* other. */ - COSTS_N_INSNS (1), /* cost of movsx. */ - COSTS_N_INSNS (1), /* cost of movzx. */ - 8, /* "large" insn. */ - 9, /* MOVE_RATIO. */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ + /* Start of register allocator costs. integer->integer move cost is 2. */ /* reg-reg moves are done by renaming and thus they are even cheaper than 1 cycle. 
Because reg-reg move cost is 2 and following tables correspond @@ -1339,12 +1447,48 @@ struct processor_costs znver2_cost = { register. */ {6, 6, 6, 6, 12}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit. */ - {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit. */ - {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 6, 6, /* SSE->integer and integer->SSE moves. */ + /* End of register allocator costs. */ + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1), /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (3), /* DI. */ + COSTS_N_INSNS (3)}, /* other. */ + 0, /* cost of multiply per each bit + set. */ + /* Depending on parameters, idiv can get faster on ryzen. This is upper + bound. */ + {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (22), /* HI. */ + COSTS_N_INSNS (30), /* SI. */ + COSTS_N_INSNS (45), /* DI. */ + COSTS_N_INSNS (45)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 9, /* MOVE_RATIO. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {8, 8, 8}, /* cost of storing integer + registers. */ + {6, 6, 6, 6, 12}, /* cost of loading SSE registers + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 8, 16}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ + {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ + 2, 2, 3, /* cost of moving XMM,YMM,ZMM + register. */ + 6, /* cost of moving SSE register to integer. 
*/ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1416,6 +1560,30 @@ static stringop_algs skylake_memset[2] = { static const struct processor_costs skylake_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 3}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 10}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 20}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {8, 8, 8, 12, 24}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 2, 2, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1437,30 +1605,18 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (0), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - 6, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {6, 6, 3}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {6, 6, 8}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 10}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {6, 6}, /* cost of loading MMX registers - in SImode and DImode */ - {6, 6}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ - {6, 6, 6, 10, 20}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {6, 6, 6, 10, 20}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 12, 24}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ - {8, 8, 8, 12, 24}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ - 2, 2, /* SSE->integer and integer->SSE moves */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + 2, /* cost of moving SSE register to integer. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1509,6 +1665,30 @@ static stringop_algs btver1_memset[2] = { {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; const struct processor_costs btver1_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl */ + {6, 8, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {6, 8, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {12, 12, 38}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 4, /* cost of moving MMX register */ + {10, 10}, /* cost of loading MMX registers + in SImode and DImode */ + {12, 12}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {10, 10, 12, 48, 96}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 48, 96}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 14, 14, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1528,32 +1708,18 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 8, /* cost for loading QImode using movzbl */ {6, 8, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {6, 8, 6}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 28}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {12, 12, 38}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 4, /* cost of moving MMX register */ - {10, 10}, /* cost of loading MMX registers - in SImode and DImode */ - {12, 12}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {10, 10, 12, 48, 96}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {10, 10, 12, 48, 96}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {10, 10, 12, 48, 96}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ - {10, 10, 12, 48, 96}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ - 14, 14, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 14, /* cost of moving SSE register to integer. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -1600,6 +1766,30 @@ static stringop_algs btver2_memset[2] = { {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; const struct processor_costs btver2_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {8, 8, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 28}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {12, 12, 38}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 4, /* cost of moving MMX register */ + {10, 10}, /* cost of loading MMX registers + in SImode and DImode */ + {12, 12}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {10, 10, 12, 48, 96}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {10, 10, 12, 48, 96}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 14, 14, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1619,32 +1809,18 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 9, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 8, /* cost for loading QImode using movzbl */ {8, 8, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {8, 8, 6}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {12, 12, 28}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {12, 12, 38}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 4, /* cost of moving MMX register */ - {10, 10}, /* cost of loading MMX registers - in SImode and DImode */ - {12, 12}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {10, 10, 12, 48, 96}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {10, 10, 12, 48, 96}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {10, 10, 12, 48, 96}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ - {10, 10, 12, 48, 96}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ - 14, 14, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 14, /* cost of moving SSE register to integer. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -1690,6 +1866,30 @@ static stringop_algs pentium4_memset[2] = { static const struct processor_costs pentium4_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 5, /* cost for loading QImode using movzbl */ + {4, 5, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {2, 3, 2}, /* cost of storing integer registers */ + 12, /* cost of reg,reg fld/fst */ + {14, 14, 14}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {14, 14, 14}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 12, /* cost of moving MMX register */ + {16, 16}, /* cost of loading MMX registers + in SImode and DImode */ + {16, 16}, /* cost of storing MMX registers + in SImode and DImode */ + 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ + {16, 16, 16, 32, 64}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 20, 12, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ COSTS_N_INSNS (4), /* variable shift costs */ @@ -1709,32 +1909,18 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 16, /* "large" insn */ 6, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 5, /* cost for loading QImode using movzbl */ {4, 5, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {2, 3, 2}, /* cost of storing integer registers */ - 12, /* cost of reg,reg fld/fst */ - {14, 14, 14}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {14, 14, 14}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 12, /* cost of moving MMX register */ - {16, 16}, /* cost of loading MMX registers - in SImode and DImode */ - {16, 16}, /* cost of storing MMX registers - in SImode and DImode */ - 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ - {16, 16, 16, 32, 64}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {16, 16, 16, 32, 64}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {16, 16, 16, 32, 64}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ - {16, 16, 16, 32, 64}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ - 20, 12, /* SSE->integer and integer->SSE moves */ + 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ + 20, /* cost of moving SSE register to integer. */ 16, 16, /* Gather load static, per_elt. */ 16, 16, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -1783,6 +1969,30 @@ static stringop_algs nocona_memset[2] = { static const struct processor_costs nocona_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 12, /* cost of reg,reg fld/fst */ + {14, 14, 14}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {14, 14, 14}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 14, /* cost of moving MMX register */ + {12, 12}, /* cost of loading MMX registers + in SImode and DImode */ + {12, 12}, /* cost of storing MMX registers + in SImode and DImode */ + 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ + {12, 12, 12, 24, 48}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {12, 12, 12, 24, 48}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 20, 12, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1802,32 +2012,18 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 16, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 4, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {4, 4, 4}, /* cost of storing integer registers */ - 12, /* cost of reg,reg fld/fst */ - {14, 14, 14}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {14, 14, 14}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 14, /* cost of moving MMX register */ - {12, 12}, /* cost of loading MMX registers - in SImode and DImode */ - {12, 12}, /* cost of storing MMX registers - in SImode and DImode */ - 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 12, 24, 48}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {12, 12, 12, 24, 48}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {12, 12, 12, 24, 48}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ - {12, 12, 12, 24, 48}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ - 20, 12, /* SSE->integer and integer->SSE moves */ + 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ + 20, /* cost of moving SSE register to integer. */ 12, 12, /* Gather load static, per_elt. */ 12, 12, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -1874,6 +2070,30 @@ static stringop_algs atom_memset[2] = { {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs atom_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {6, 6, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 18}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {14, 14, 24}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {10, 10}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {8, 8, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {8, 8, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 8, 6, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1893,32 +2113,18 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {6, 6, 6}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {6, 6, 18}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {14, 14, 24}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {10, 10}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {8, 8, 8, 16, 32}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {8, 8, 8, 16, 32}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 16, 32}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ - {8, 8, 8, 16, 32}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ - 8, 6, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 8, /* cost of moving SSE register to integer. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -1965,6 +2171,30 @@ static stringop_algs slm_memset[2] = { {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs slm_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 8, /* cost for loading QImode using movzbl */ + {8, 8, 8}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {6, 6, 6}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {8, 8, 18}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 18}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + {8, 8, 8, 16, 32}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {8, 8, 8, 16, 32}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 8, 6, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1984,32 +2214,18 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 8, /* cost for loading QImode using movzbl */ {8, 8, 8}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {6, 6, 6}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {8, 8, 18}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 18}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {8, 8}, /* cost of loading MMX registers - in SImode and DImode */ - {6, 6}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {8, 8, 8, 16, 32}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {8, 8, 8, 16, 32}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {8, 8, 8, 16, 32}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ - {8, 8, 8, 16, 32}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ - 8, 6, /* SSE->integer and integer->SSE moves */ + 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ + 8, /* cost of moving SSE register to integer. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2056,6 +2272,30 @@ static stringop_algs intel_memset[2] = { {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs intel_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {6, 6, 6}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 10}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 6, 6}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 6, 6}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 4, 4, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -2075,32 +2315,18 @@ struct processor_costs intel_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ {6, 6, 6}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {6, 6, 8}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 10}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {6, 6}, /* cost of loading MMX registers - in SImode and DImode */ - {6, 6}, /* cost of storing MMX registers - in SImode and DImode */ + {6, 6, 6, 6, 6}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 6, 6}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {10, 10, 10, 10, 10}, /* cost of unaligned loads. 
*/ + {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ - {6, 6, 6, 6, 6}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ - {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ - {6, 6, 6, 6, 6}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ - {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ - 4, 4, /* SSE->integer and integer->SSE moves */ + 4, /* cost of moving SSE register to integer. */ 6, 6, /* Gather load static, per_elt. */ 6, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2151,6 +2377,30 @@ static stringop_algs generic_memset[2] = { {-1, libcall, false}}}}; static const struct processor_costs generic_cost = { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 12}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. 
*/ + COSTS_N_INSNS (1), /* cost of an add instruction */ /* Setting cost to 2 makes our current implementation of synth_mult result in use of unnecessary temporary registers causing regression on several @@ -2173,32 +2423,18 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ {6, 6, 6}, /* cost of storing integer registers */ - 4, /* cost of reg,reg fld/fst */ - {6, 6, 12}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 12}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {6, 6}, /* cost of loading MMX registers - in SImode and DImode */ - {6, 6}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ - {6, 6, 6, 10, 15}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {6, 6, 6, 10, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ - {6, 6, 6, 10, 15}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ - 6, 6, /* SSE->integer and integer->SSE moves */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2251,6 +2487,30 @@ static stringop_algs core_memset[2] = { static const struct processor_costs core_cost = { + /* Start of register allocator costs. 
integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 10}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 6, 12}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 6, 12}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 2, 2, /* SSE->integer and integer->SSE moves */ + /* End of register allocator costs. */ + COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With this cost however our current implementation of synth_mult results in @@ -2277,32 +2537,18 @@ struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - - /* All move costs are relative to integer->integer move times 2 and thus - they are latency*2. */ - 6, /* cost for loading QImode using movzbl */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ {6, 6, 6}, /* cost of storing integer registers */ - 2, /* cost of reg,reg fld/fst */ - {6, 6, 8}, /* cost of loading fp registers - in SFmode, DFmode and XFmode */ - {6, 6, 10}, /* cost of storing fp registers - in SFmode, DFmode and XFmode */ - 2, /* cost of moving MMX register */ - {6, 6}, /* cost of loading MMX registers - in SImode and DImode */ - {6, 6}, /* cost of storing MMX registers - in SImode and DImode */ - 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ - {6, 6, 6, 6, 12}, /* cost of loading SSE registers - in 32,64,128,256 and 512-bit */ + {6, 6, 6, 6, 12}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 6, 12}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ - {6, 6, 6, 6, 12}, /* cost of storing SSE registers - in 32,64,128,256 and 512-bit */ {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ - 2, 2, /* SSE->integer and integer->SSE moves */ + 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ + 2, /* cost of moving SSE register to integer. */ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, rec. throughput 6. So 5 uops statically and one uops per load. */