linux/arch/powerpc/kernel/power4-pmu.c


/*
* Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
*
* Copyright 2009 Paul Mackerras, IBM Corporation.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/perf_counter.h>
#include <asm/reg.h>
/*
* Bits in event code for POWER4
*/
#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
#define PM_PMC_MSK 0xf
#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
#define PM_UNIT_MSK 0xf
#define PM_LOWER_SH 6
#define PM_LOWER_MSK 1
#define PM_LOWER_MSKS 0x40
#define PM_BYTE_SH 4 /* Byte number of event bus to use */
#define PM_BYTE_MSK 3
#define PM_PMCSEL_MSK 7
/*
* Unit code values
*/
#define PM_FPU 1
#define PM_ISU1 2
#define PM_IFU 3
#define PM_IDU0 4
#define PM_ISU1_ALT 6
#define PM_ISU2 7
#define PM_IFU_ALT 8
#define PM_LSU0 9
#define PM_LSU1 0xc
#define PM_GPS 0xf
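/*
* Worked example: the generic cache-references code 0x8c10 (PM_LD_REF_L1,
* see p4_generic_events at the bottom of this file) decodes with the
* fields above as
*
*	pmc  = (0x8c10 >> PM_PMC_SH)  & PM_PMC_MSK  = 8    (PMC8)
*	unit = (0x8c10 >> PM_UNIT_SH) & PM_UNIT_MSK = 0xc  (PM_LSU1)
*	byte = (0x8c10 >> PM_BYTE_SH) & PM_BYTE_MSK = 1    (event bus byte 1)
*	psel = 0x8c10 & PM_PMCSEL_MSK               = 0    (add event)
*
* PM_ISU1_ALT (6) and PM_IFU_ALT (8) are alternate encodings of ISU1 and
* IFU; p4_compute_mmcr() below maps them back to units 2 and 3.
*/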
/*
* Bits in MMCR0 for POWER4
*/
#define MMCR0_PMC1SEL_SH 8
#define MMCR0_PMC2SEL_SH 1
#define MMCR_PMCSEL_MSK 0x1f
/*
* Bits in MMCR1 for POWER4
*/
#define MMCR1_TTM0SEL_SH 62
#define MMCR1_TTC0SEL_SH 61
#define MMCR1_TTM1SEL_SH 59
#define MMCR1_TTC1SEL_SH 58
#define MMCR1_TTM2SEL_SH 56
#define MMCR1_TTC2SEL_SH 55
#define MMCR1_TTM3SEL_SH 53
#define MMCR1_TTC3SEL_SH 52
#define MMCR1_TTMSEL_MSK 3
#define MMCR1_TD_CP_DBG0SEL_SH 50
#define MMCR1_TD_CP_DBG1SEL_SH 48
#define MMCR1_TD_CP_DBG2SEL_SH 46
#define MMCR1_TD_CP_DBG3SEL_SH 44
#define MMCR1_DEBUG0SEL_SH 43
#define MMCR1_DEBUG1SEL_SH 42
#define MMCR1_DEBUG2SEL_SH 41
#define MMCR1_DEBUG3SEL_SH 40
#define MMCR1_PMC1_ADDER_SEL_SH 39
#define MMCR1_PMC2_ADDER_SEL_SH 38
#define MMCR1_PMC6_ADDER_SEL_SH 37
#define MMCR1_PMC5_ADDER_SEL_SH 36
#define MMCR1_PMC8_ADDER_SEL_SH 35
#define MMCR1_PMC7_ADDER_SEL_SH 34
#define MMCR1_PMC3_ADDER_SEL_SH 33
#define MMCR1_PMC4_ADDER_SEL_SH 32
#define MMCR1_PMC3SEL_SH 27
#define MMCR1_PMC4SEL_SH 22
#define MMCR1_PMC5SEL_SH 17
#define MMCR1_PMC6SEL_SH 12
#define MMCR1_PMC7SEL_SH 7
#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
static short mmcr1_adder_bits[8] = {
MMCR1_PMC1_ADDER_SEL_SH,
MMCR1_PMC2_ADDER_SEL_SH,
MMCR1_PMC3_ADDER_SEL_SH,
MMCR1_PMC4_ADDER_SEL_SH,
MMCR1_PMC5_ADDER_SEL_SH,
MMCR1_PMC6_ADDER_SEL_SH,
MMCR1_PMC7_ADDER_SEL_SH,
MMCR1_PMC8_ADDER_SEL_SH
};
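/*
* mmcr1_adder_bits[] is indexed by 0-based PMC number and gives the MMCR1
* *_ADDER_SEL bit for that PMC; p4_compute_mmcr() sets it when a direct
* "add" event (PMCSEL 0) has to count from the higher-numbered bus byte.
*/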
/*
* Bits in MMCRA
*/
#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
/*
* Layout of constraint bits:
* 6666555555555544444444443333333333222222222211111111110000000000
* 3210987654321098765432109876543210987654321098765432109876543210
*        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
*        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P8P7P6P5P4P3P2P1
*        \SMPL         ||\TTC3SEL
*                      |\TTC_IFU_SEL
*                      \TTM2SEL0
*
* SMPL - SAMPLE_ENABLE constraint
* 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
*
* UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
* 55: UC1 error 0x0080_0000_0000_0000
* 54: FPU events needed 0x0040_0000_0000_0000
* 53: ISU1 events needed 0x0020_0000_0000_0000
* 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
*
* UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
* 51: UC2 error 0x0008_0000_0000_0000
* 50: FPU events needed 0x0004_0000_0000_0000
* 49: IFU events needed 0x0002_0000_0000_0000
* 48: LSU0 events needed 0x0001_0000_0000_0000
*
* UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
* 47: UC3 error 0x8000_0000_0000
* 46: LSU0 events needed 0x4000_0000_0000
* 45: IFU events needed 0x2000_0000_0000
* 44: IDU0|ISU2 events needed 0x1000_0000_0000
* 43: ISU1 events needed 0x0800_0000_0000
*
* TTM2SEL0
* 42: 0 = IDU0 events needed
* 1 = ISU2 events needed 0x0400_0000_0000
*
* TTC_IFU_SEL
* 41: 0 = IFU.U events needed
* 1 = IFU.L events needed 0x0200_0000_0000
*
* TTC3SEL
* 40: 0 = LSU1.U events needed
* 1 = LSU1.L events needed 0x0100_0000_0000
*
* PS1
* 39: PS1 error 0x0080_0000_0000
* 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
*
* PS2
* 35: PS2 error 0x0008_0000_0000
* 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
*
* B0
* 28-31: Byte 0 event source 0xf000_0000
* 1 = FPU
* 2 = ISU1
* 3 = IFU
* 4 = IDU0
* 7 = ISU2
* 9 = LSU0
* c = LSU1
* f = GPS
*
* B1, B2, B3
* 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
*
* P8
* 15: P8 error 0x8000
* 14-15: Count of events needing PMC8
*
* P1..P7
* 0-13: Count of events needing PMC1..PMC7
*
* Note: this doesn't allow events using IFU.U to be combined with events
* using IFU.L, though that is feasible (using TTM0 and TTM2). However
* there are no listed events for IFU.L (they are debug events not
* verified for performance monitoring) so this shouldn't cause a
* problem.
*/
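/*
* For illustration, the constraint words computed by p4_get_constraint()
* below for the instructions-completed code 0x1001 (PM_INST_CMPL on PMC1,
* PMCSEL 1) are, in terms of this layout:
*
*	mask  = 0x0000008000000002   (top bit of P1, PS1 error bit)
*	value = 0x0000001000000001   (one event on PMC1, one in PMC1/2/5/6)
*
* No unit field is involved, so none of the UC, TTM/TTC or byte-lane bits
* are set.
*/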
static struct unitinfo {
u64 value, mask;
int unit;
int lowerbit;
} p4_unitinfo[16] = {
[PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
[PM_ISU1_ALT] =
{ 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
[PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
[PM_IFU_ALT] =
{ 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
[PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
};
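/*
* Each entry above encodes, per the constraint layout, which "events
* needed" bits a unit contributes (value) and which error/selector bits
* constrain it (mask), plus the constraint bit carrying the unit's
* lower/upper select (lowerbit).  E.g. PM_FPU's value sets bits 54 and 50
* (FPU needed in UC1 and UC2) under mask bits 55 and 51 (the UC1/UC2
* error bits), while PM_IFU and PM_LSU1 use lowerbit 41 (TTC_IFU_SEL) and
* 40 (TTC3SEL) respectively.
*/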
static unsigned char direct_marked_event[8] = {
(1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
(1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
(1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
(1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
(1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
(1<<3) | (1<<4) | (1<<5),
/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
(1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
(1<<4), /* PMC8: PM_MRK_LSU_FIN */
};
/*
* Returns 1 if event counts things relating to marked instructions
* and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
*/
static int p4_marked_instr_event(unsigned int event)
{
int pmc, psel, unit, byte, bit;
unsigned int mask;
pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
psel = event & PM_PMCSEL_MSK;
if (pmc) {
if (direct_marked_event[pmc - 1] & (1 << psel))
return 1;
if (psel == 0) /* add events */
bit = (pmc <= 4)? pmc - 1: 8 - pmc;
else if (psel == 6) /* decode events */
bit = 4;
else
return 0;
} else
bit = psel;
byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
mask = 0;
switch (unit) {
case PM_LSU1:
if (event & PM_LOWER_MSKS)
mask = 1 << 28; /* byte 7 bit 4 */
else
mask = 6 << 24; /* byte 3 bits 1 and 2 */
break;
case PM_LSU0:
/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
mask = 0x083dff00;
}
return (mask >> (byte * 8 + bit)) & 1;
}
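/*
* For example, event 0x1002 (PMC1, PMCSEL 2, i.e. PM_MRK_GRP_DISP per
* direct_marked_event[] above) returns 1 here, whereas 0x1001
* (PM_INST_CMPL on PMC1) returns 0 and does not force MMCRA_SAMPLE_ENABLE.
*/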
static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
{
int pmc, byte, unit, lower, sh;
u64 mask = 0, value = 0;
int grp = -1;
pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
if (pmc) {
if (pmc > 8)
return -1;
sh = (pmc - 1) * 2;
mask |= 2 << sh;
value |= 1 << sh;
grp = ((pmc - 1) >> 1) & 1;
}
unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
if (unit) {
lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
/*
* Bus events on bytes 0 and 2 can be counted
* on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
*/
if (!pmc)
grp = byte & 1;
if (!p4_unitinfo[unit].unit)
return -1;
mask |= p4_unitinfo[unit].mask;
value |= p4_unitinfo[unit].value;
sh = p4_unitinfo[unit].lowerbit;
if (sh > 1)
value |= (u64)lower << sh;
else if (lower != sh)
return -1;
unit = p4_unitinfo[unit].unit;
/* Set byte lane select field */
mask |= 0xfULL << (28 - 4 * byte);
value |= (u64)unit << (28 - 4 * byte);
}
if (grp == 0) {
/* increment PMC1/2/5/6 field */
mask |= 0x8000000000ull;
value |= 0x1000000000ull;
} else {
/* increment PMC3/4/7/8 field */
mask |= 0x800000000ull;
value |= 0x100000000ull;
}
/* Marked instruction events need sample_enable set */
if (p4_marked_instr_event(event)) {
mask |= 1ull << 56;
value |= 1ull << 56;
}
/* PMCSEL=6 decode events on byte 2 need sample_enable clear */
if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
mask |= 1ull << 56;
*maskp = mask;
*valp = value;
return 0;
}
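/*
* Alternative codes for PM_INST_CMPL: PMCSEL 1 on PMC1, 4, 6, 7 or 8.
*/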
static unsigned int ppc_inst_cmpl[] = {
0x1001, 0x4001, 0x6001, 0x7001, 0x8001
};
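/*
* The flags argument is part of the common get_alternatives() interface
* (it carries the limited-PMC hints used by the POWER5+/POWER6 back-ends);
* POWER4 has no limited counters, so it is ignored here.
*/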
static int p4_get_alternatives(unsigned int event, unsigned int flags,
unsigned int alt[])
{
int i, j, na;
alt[0] = event;
na = 1;
/* 2 possibilities for PM_GRP_DISP_REJECT */
if (event == 0x8003 || event == 0x0224) {
alt[1] = event ^ (0x8003 ^ 0x0224);
return 2;
}
/* 2 possibilities for PM_ST_MISS_L1 */
if (event == 0x0c13 || event == 0x0c23) {
alt[1] = event ^ (0x0c13 ^ 0x0c23);
return 2;
}
/* several possibilities for PM_INST_CMPL */
for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
if (event == ppc_inst_cmpl[i]) {
for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
if (j != i)
alt[na++] = ppc_inst_cmpl[j];
break;
}
}
return na;
}
static int p4_compute_mmcr(unsigned int event[], int n_ev,
unsigned int hwc[], u64 mmcr[])
{
u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
unsigned int pmc, unit, byte, psel, lower;
unsigned int ttm, grp;
unsigned int pmc_inuse = 0;
unsigned int pmc_grp_use[2];
unsigned char busbyte[4];
unsigned char unituse[16];
unsigned int unitlower = 0;
int i;
if (n_ev > 8)
return -1;
/* First pass to count resource use */
pmc_grp_use[0] = pmc_grp_use[1] = 0;
memset(busbyte, 0, sizeof(busbyte));
memset(unituse, 0, sizeof(unituse));
for (i = 0; i < n_ev; ++i) {
pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
if (pmc) {
if (pmc_inuse & (1 << (pmc - 1)))
return -1;
pmc_inuse |= 1 << (pmc - 1);
/* count 1/2/5/6 vs 3/4/7/8 use */
++pmc_grp_use[((pmc - 1) >> 1) & 1];
}
unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
if (unit) {
if (!pmc)
++pmc_grp_use[byte & 1];
if (unit == 6 || unit == 8)
/* map alt ISU1/IFU codes: 6->2, 8->3 */
unit = (unit >> 1) - 1;
if (busbyte[byte] && busbyte[byte] != unit)
return -1;
busbyte[byte] = unit;
lower <<= unit;
if (unituse[unit] && lower != (unitlower & lower))
return -1;
unituse[unit] = 1;
unitlower |= lower;
}
}
if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
return -1;
/*
* Assign resources and set multiplexer selects.
*
* Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,9 on TTM2.
* Each TTMx can only select one unit, but since
* units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
* we have some choices.
*/
if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
unituse[6] = 1; /* Move 2 to 6 */
unituse[2] = 0;
}
if (unituse[3] & (unituse[1] | unituse[2])) {
unituse[8] = 1; /* Move 3 to 8 */
unituse[3] = 0;
unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
}
/* Check only one unit per TTMx */
if (unituse[1] + unituse[2] + unituse[3] > 1 ||
unituse[4] + unituse[6] + unituse[7] > 1 ||
unituse[8] + unituse[9] > 1 ||
(unituse[5] | unituse[10] | unituse[11] |
unituse[13] | unituse[14]))
return -1;
/* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
/* Set TTCxSEL fields. */
if (unitlower & 0xe)
mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
if (unitlower & 0xf0)
mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
if (unitlower & 0xf00)
mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
if (unitlower & 0x7000)
mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
/* Set byte lane select fields. */
for (byte = 0; byte < 4; ++byte) {
unit = busbyte[byte];
if (!unit)
continue;
if (unit == 0xf) {
/* special case for GPS */
mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
} else {
if (!unituse[unit])
ttm = unit - 1; /* 2->1, 3->2 */
else
ttm = unit >> 2;
mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
}
}
/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
for (i = 0; i < n_ev; ++i) {
pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
psel = event[i] & PM_PMCSEL_MSK;
if (!pmc) {
/* Bus event or 00xxx direct event (off or cycles) */
if (unit)
psel |= 0x10 | ((byte & 2) << 2);
for (pmc = 0; pmc < 8; ++pmc) {
if (pmc_inuse & (1 << pmc))
continue;
grp = (pmc >> 1) & 1;
if (unit) {
if (grp == (byte & 1))
break;
} else if (pmc_grp_use[grp] < 4) {
++pmc_grp_use[grp];
break;
}
}
pmc_inuse |= 1 << pmc;
} else {
/* Direct event */
--pmc;
if (psel == 0 && (byte & 2))
/* add events on higher-numbered bus */
mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
else if (psel == 6 && byte == 3)
/* seem to need to set sample_enable here */
mmcra |= MMCRA_SAMPLE_ENABLE;
psel |= 8;
}
if (pmc <= 1)
mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
else
mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
if (pmc == 7) /* PMC8 */
mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
hwc[i] = pmc;
if (p4_marked_instr_event(event[i]))
mmcra |= MMCRA_SAMPLE_ENABLE;
}
if (pmc_inuse & 1)
mmcr0 |= MMCR0_PMC1CE;
if (pmc_inuse & 0xfe)
mmcr0 |= MMCR0_PMCjCE;
mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
/* Return MMCRx values */
mmcr[0] = mmcr0;
mmcr[1] = mmcr1;
mmcr[2] = mmcra;
return 0;
}
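/*
* Illustrative usage (a sketch; the actual caller lives in the generic
* powerpc perf_counter code, not in this file):
*
*	unsigned int events[2] = { 7, 0x1001 };	(cycles, instructions)
*	unsigned int hwc[2];
*	u64 mmcr[3];
*
*	p4_compute_mmcr(events, 2, hwc, mmcr);
*
* On success (return 0), mmcr[0..2] hold the MMCR0/MMCR1/MMCRA images to
* program and hwc[i] is the 0-based PMC assigned to events[i]; a return
* of -1 means the events cannot be scheduled together.
*/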
static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
{
/*
* Setting the PMCxSEL field to 0 disables PMC x.
* (Note that pmc is 0-based here, not 1-based.)
*/
if (pmc <= 1) {
mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
} else {
mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
if (pmc == 7)
mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
}
}
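/*
* E.g. p4_disable_pmc(2, mmcr) zeroes the PMC3SEL field in MMCR1 (bits
* 27-31), while p4_disable_pmc(7, mmcr) clears PMC8SEL, whose low bit
* lives in MMCRA on GP (see MMCRA_PMC8SEL0_SH above).
*/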
static int p4_generic_events[] = {
[PERF_COUNT_CPU_CYCLES] = 7,
[PERF_COUNT_INSTRUCTIONS] = 0x1001,
[PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
[PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
[PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
};
struct power_pmu power4_pmu = {
.n_counter = 8,
.max_alternatives = 5,
.add_fields = 0x0000001100005555ull,
.test_adder = 0x0011083300000000ull,
.compute_mmcr = p4_compute_mmcr,
.get_constraint = p4_get_constraint,
.get_alternatives = p4_get_alternatives,
.disable_pmc = p4_disable_pmc,
.n_generic = ARRAY_SIZE(p4_generic_events),
.generic_events = p4_generic_events,
};
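/*
* power4_pmu is not referenced elsewhere in this file; it is presumably
* picked up by the generic powerpc perf_counter code based on the
* processor version.  n_counter covers the eight PMCs handled above,
* max_alternatives matches ARRAY_SIZE(ppc_inst_cmpl) (the largest set
* p4_get_alternatives() can return), and add_fields/test_adder
* parameterize the P1-P8 and PS1/PS2 counting fields of the constraint
* layout.
*/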