Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar: "The main kernel changes were: - add support for Intel's "adaptive PEBS v4" - which embedds LBS data in PEBS records and can thus batch up and reduce the IRQ (NMI) rate significantly - reducing overhead and making call-graph profiling less intrusive. - add Intel CPU core and uncore support updates for Tremont, Icelake, - extend the x86 PMU constraints scheduler with 'constraint ranges' to better support Icelake hw constraints, - make x86 call-chain support work better with CONFIG_FRAME_POINTER=y - misc other changes Tooling changes: - updates to the main tools: 'perf record', 'perf trace', 'perf stat' - updated Intel and S/390 vendor events - libtraceevent updates - misc other updates and fixes" * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (69 commits) perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER watchdog: Fix typo in comment perf/x86/intel: Add Tremont core PMU support perf/x86/intel/uncore: Add Intel Icelake uncore support perf/x86/msr: Add Icelake support perf/x86/intel/rapl: Add Icelake support perf/x86/intel/cstate: Add Icelake support perf/x86/intel: Add Icelake support perf/x86: Support constraint ranges perf/x86/lbr: Avoid reading the LBRs when adaptive PEBS handles them perf/x86/intel: Support adaptive PEBS v4 perf/x86/intel/ds: Extract code of event update in short period perf/x86/intel: Extract memory code PEBS parser for reuse perf/x86: Support outputting XMM registers perf/x86/intel: Force resched when TFA sysctl is modified perf/core: Add perf_pmu_resched() as global function perf/headers: Fix stale comment for struct perf_addr_filter perf/core: Make perf_swevent_init_cpu() static perf/x86: Add sanity checks to x86_schedule_events() perf/x86: Optimize x86_schedule_events() ...
2019-05-06 14:16:36 -07:00 · 2019-05-06 14:16:36 -07:00 · 90489a72fb
parent 007dc78fea d15d356887
commit 90489a72fb
140 changed files with 13693 additions and 8444 deletions
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@ -560,6 +560,21 @@ int x86_pmu_hw_config(struct perf_event *event)
 			return -EINVAL;
 	}

+	/* sample_regs_user never support XMM registers */
+	if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS))
+		return -EINVAL;
+	/*
+	 * Besides the general purpose registers, XMM registers may
+	 * be collected in PEBS on some platforms, e.g. Icelake
+	 */
+	if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) {
+		if (x86_pmu.pebs_no_xmm_regs)
+			return -EINVAL;
+
+		if (!event->attr.precise_ip)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }

@ -661,6 +676,10 @@ static inline int is_x86_event(struct perf_event *event)
 	return event->pmu == &pmu;
 }

+struct pmu *x86_get_pmu(void)
+{
+	return &pmu;
+}
 /*
 * Event scheduler state:
 *
@ -849,18 +868,43 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	struct perf_event *e;
-	int i, wmin, wmax, unsched = 0;
+	int n0, i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;

 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

+	/*
+	 * Compute the number of events already present; see x86_pmu_add(),
+	 * validate_group() and x86_pmu_commit_txn(). For the former two
+	 * cpuc->n_events hasn't been updated yet, while for the latter
+	 * cpuc->n_txn contains the number of events added in the current
+	 * transaction.
+	 */
+	n0 = cpuc->n_events;
+	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+		n0 -= cpuc->n_txn;
+
 	if (x86_pmu.start_scheduling)
 		x86_pmu.start_scheduling(cpuc);

 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-		cpuc->event_constraint[i] = NULL;
-		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-		cpuc->event_constraint[i] = c;
+		c = cpuc->event_constraint[i];
+
+		/*
+		 * Previously scheduled events should have a cached constraint,
+		 * while new events should not have one.
+		 */
+		WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
+
+		/*
+		 * Request constraints for new events; or for those events that
+		 * have a dynamic constraint -- for those the constraint can
+		 * change due to external factors (sibling state, allow_tfa).
+		 */
+		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
+			c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+			cpuc->event_constraint[i] = c;
+		}

 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
@ -925,25 +969,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
-			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
 			if (x86_pmu.commit_scheduling)
 				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
 		}
 	} else {
-		for (i = 0; i < n; i++) {
+		for (i = n0; i < n; i++) {
 			e = cpuc->event_list[i];
-			/*
-			 * do not put_constraint() on comitted events,
-			 * because they are good to go
-			 */
-			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
-				continue;

 			/*
 			 * release events that failed scheduling
 			 */
 			if (x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, e);
+
+			cpuc->event_constraint[i] = NULL;
 		}
 	}

@ -1372,11 +1411,6 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	int i;

-	/*
-	 * event is descheduled
-	 */
-	event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
 	/*
 	 * If we're called during a txn, we only need to undo x86_pmu.add.
 	 * The events never got scheduled and ->cancel_txn will truncate
@ -1413,6 +1447,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 		cpuc->event_list[i-1] = cpuc->event_list[i];
 		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
 	}
+	cpuc->event_constraint[i-1] = NULL;
 	--cpuc->n_events;

 	perf_event_update_userpage(event);
@ -2024,7 +2059,7 @@ static int validate_event(struct perf_event *event)
 	if (IS_ERR(fake_cpuc))
 		return PTR_ERR(fake_cpuc);

-	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
+	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);

 	if (!c || !c->weight)
 		ret = -EINVAL;
@ -2072,8 +2107,7 @@ static int validate_group(struct perf_event *event)
 	if (n < 0)
 		goto out;

-	fake_cpuc->n_events = n;
-
+	fake_cpuc->n_events = 0;
 	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

 out:
@ -2348,6 +2382,15 @@ void arch_perf_update_userpage(struct perf_event *event,
 	cyc2ns_read_end();
 }

+/*
+ * Determine whether the regs were taken from an irq/exception handler rather
+ * than from perf_arch_fetch_caller_regs().
+ */
+static bool perf_hw_regs(struct pt_regs *regs)
+{
+	return regs->flags & X86_EFLAGS_FIXED;
+}
+
 void
 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
@ -2359,11 +2402,15 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 		return;
 	}

-	if (perf_callchain_store(entry, regs->ip))
-		return;
+	if (perf_hw_regs(regs)) {
+		if (perf_callchain_store(entry, regs->ip))
+			return;
+		unwind_start(&state, current, regs, NULL);
+	} else {
+		unwind_start(&state, current, NULL, (void *)regs->sp);
+	}

-	for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
-	     unwind_next_frame(&state)) {
+	for (; !unwind_done(&state); unwind_next_frame(&state)) {
 		addr = unwind_get_return_address(&state);
 		if (!addr || perf_callchain_store(entry, addr))
 			return;
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@ -239,6 +239,35 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };

+static struct event_constraint intel_icl_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	INTEL_UEVENT_CONSTRAINT(0x1c0, 0),	/* INST_RETIRED.PREC_DIST */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	FIXED_EVENT_CONSTRAINT(0x0400, 3),	/* SLOTS */
+	INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
+	INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
+	INTEL_EVENT_CONSTRAINT(0x32, 0xf),	/* SW_PREFETCH_ACCESS.* */
+	INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
+	INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
+	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_TOTAL */
+	INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0xf),      /* CYCLE_ACTIVITY.* */
+	INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
+	INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
+	INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf),
+	INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf),
+	EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff9fffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff9fffull, RSP_1),
+	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+	INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+	EVENT_EXTRA_END
+};
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");
 EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
@ -1827,6 +1856,45 @@ static __initconst const u64 glp_hw_cache_extra_regs
 	},
 };

+#define TNT_LOCAL_DRAM			BIT_ULL(26)
+#define TNT_DEMAND_READ			GLM_DEMAND_DATA_RD
+#define TNT_DEMAND_WRITE		GLM_DEMAND_RFO
+#define TNT_LLC_ACCESS			GLM_ANY_RESPONSE
+#define TNT_SNP_ANY			(SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \
+					 SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM)
+#define TNT_LLC_MISS			(TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM)
+
+static __initconst const u64 tnt_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)]	= TNT_DEMAND_READ|
+						  TNT_LLC_ACCESS,
+			[C(RESULT_MISS)]	= TNT_DEMAND_READ|
+						  TNT_LLC_MISS,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)]	= TNT_DEMAND_WRITE|
+						  TNT_LLC_ACCESS,
+			[C(RESULT_MISS)]	= TNT_DEMAND_WRITE|
+						  TNT_LLC_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)]	= 0x0,
+			[C(RESULT_MISS)]	= 0x0,
+		},
+	},
+};
+
+static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
+	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1),
+	EVENT_EXTRA_END
+};
+
 #define KNL_OT_L2_HITE		BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF		BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL	BIT_ULL(21)
@ -2015,7 +2083,7 @@ static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int
 	/*
 	 * We're going to use PMC3, make sure TFA is set before we touch it.
 	 */
-	if (cntr == 3 && !cpuc->is_fake)
+	if (cntr == 3)
 		intel_set_tfa(cpuc, true);
 }

@ -2149,6 +2217,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 	bits <<= (idx * 4);
 	mask = 0xfULL << (idx * 4);

+	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
+		bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
+		mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
+	}
+
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
 	ctrl_val |= bits;
@ -2692,7 +2765,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,

 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if ((event->hw.config & c->cmask) == c->code) {
+			if (constraint_match(c, event->hw.config)) {
 				event->hw.flags |= c->flags;
 				return c;
 			}
@ -2842,7 +2915,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
 	struct intel_excl_states *xlo;
 	int tid = cpuc->excl_thread_id;
-	int is_excl, i;
+	int is_excl, i, w;

 	/*
 	 * validating a group does not require
@ -2898,36 +2971,40 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * SHARED   : sibling counter measuring non-exclusive event
 	 * UNUSED   : sibling counter unused
 	 */
+	w = c->weight;
 	for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
 		/*
 		 * exclusive event in sibling counter
 		 * our corresponding counter cannot be used
 		 * regardless of our event
 		 */
-		if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+		if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) {
 			__clear_bit(i, c->idxmsk);
+			w--;
+			continue;
+		}
 		/*
 		 * if measuring an exclusive event, sibling
 		 * measuring non-exclusive, then counter cannot
 		 * be used
 		 */
-		if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+		if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) {
 			__clear_bit(i, c->idxmsk);
+			w--;
+			continue;
+		}
 	}

-	/*
-	 * recompute actual bit weight for scheduling algorithm
-	 */
-	c->weight = hweight64(c->idxmsk64);
-
 	/*
 	 * if we return an empty mask, then switch
 	 * back to static empty constraint to avoid
 	 * the cost of freeing later on
 	 */
-	if (c->weight == 0)
+	if (!w)
 		c = &emptyconstraint;

+	c->weight = w;
+
 	return c;
 }

@ -2935,11 +3012,9 @@ static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
-	struct event_constraint *c1 = NULL;
-	struct event_constraint *c2;
+	struct event_constraint *c1, *c2;

-	if (idx >= 0) /* fake does < 0 */
-		c1 = cpuc->event_constraint[idx];
+	c1 = cpuc->event_constraint[idx];

 	/*
 	 * first time only
@ -2947,7 +3022,8 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	 * - dynamic constraint: handled by intel_get_excl_constraints()
 	 */
 	c2 = __intel_get_event_constraints(cpuc, idx, event);
-	if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+	if (c1) {
+	        WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC));
 		bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
 		c1->weight = c2->weight;
 		c2 = c1;
@ -3370,6 +3446,12 @@ static struct event_constraint counter0_constraint =
 static struct event_constraint counter2_constraint =
 			EVENT_CONSTRAINT(0, 0x4, 0);

+static struct event_constraint fixed0_constraint =
+			FIXED_EVENT_CONSTRAINT(0x00c0, 0);
+
+static struct event_constraint fixed0_counter0_constraint =
+			INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
+
 static struct event_constraint *
 hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			  struct perf_event *event)
@ -3388,6 +3470,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	return c;
 }

+static struct event_constraint *
+icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
+{
+	/*
+	 * Fixed counter 0 has less skid.
+	 * Force instruction:ppp in Fixed counter 0
+	 */
+	if ((event->attr.precise_ip == 3) &&
+	    constraint_match(&fixed0_constraint, event->hw.config))
+		return &fixed0_constraint;
+
+	return hsw_get_event_constraints(cpuc, idx, event);
+}
+
 static struct event_constraint *
 glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			  struct perf_event *event)
@ -3403,6 +3500,29 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	return c;
 }

+static struct event_constraint *
+tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	/*
+	 * :ppp means to do reduced skid PEBS,
+	 * which is available on PMC0 and fixed counter 0.
+	 */
+	if (event->attr.precise_ip == 3) {
+		/* Force instruction:ppp on PMC0 and Fixed counter 0 */
+		if (constraint_match(&fixed0_constraint, event->hw.config))
+			return &fixed0_counter0_constraint;
+
+		return &counter0_constraint;
+	}
+
+	c = intel_get_event_constraints(cpuc, idx, event);
+
+	return c;
+}
+
 static bool allow_tsx_force_abort = true;

 static struct event_constraint *
@ -3414,7 +3534,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	/*
 	 * Without TFA we must not use PMC3.
 	 */
-	if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) {
+	if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
 		c = dyn_constraint(cpuc, c, idx);
 		c->idxmsk64 &= ~(1ULL << 3);
 		c->weight--;
@ -3511,6 +3631,8 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)

 int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
 {
+	cpuc->pebs_record_size = x86_pmu.pebs_record_size;
+
 	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
 		cpuc->shared_regs = allocate_shared_regs(cpu);
 		if (!cpuc->shared_regs)
@ -4118,6 +4240,42 @@ static struct attribute *hsw_tsx_events_attrs[] = {
 	NULL
 };

+EVENT_ATTR_STR(tx-capacity-read,  tx_capacity_read,  "event=0x54,umask=0x80");
+EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-capacity-read,  el_capacity_read,  "event=0x54,umask=0x80");
+EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2");
+
+static struct attribute *icl_events_attrs[] = {
+	EVENT_PTR(mem_ld_hsw),
+	EVENT_PTR(mem_st_hsw),
+	NULL,
+};
+
+static struct attribute *icl_tsx_events_attrs[] = {
+	EVENT_PTR(tx_start),
+	EVENT_PTR(tx_abort),
+	EVENT_PTR(tx_commit),
+	EVENT_PTR(tx_capacity_read),
+	EVENT_PTR(tx_capacity_write),
+	EVENT_PTR(tx_conflict),
+	EVENT_PTR(el_start),
+	EVENT_PTR(el_abort),
+	EVENT_PTR(el_commit),
+	EVENT_PTR(el_capacity_read),
+	EVENT_PTR(el_capacity_write),
+	EVENT_PTR(el_conflict),
+	EVENT_PTR(cycles_t),
+	EVENT_PTR(cycles_ct),
+	NULL,
+};
+
+static __init struct attribute **get_icl_events_attrs(void)
+{
+	return boot_cpu_has(X86_FEATURE_RTM) ?
+		merge_attr(icl_events_attrs, icl_tsx_events_attrs) :
+		icl_events_attrs;
+}
+
 static ssize_t freeze_on_smi_show(struct device *cdev,
 				  struct device_attribute *attr,
 				  char *buf)
@ -4157,6 +4315,50 @@ done:
 	return count;
 }

+static void update_tfa_sched(void *ignored)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	/*
+	 * check if PMC3 is used
+	 * and if so force schedule out for all event types all contexts
+	 */
+	if (test_bit(3, cpuc->active_mask))
+		perf_pmu_resched(x86_get_pmu());
+}
+
+static ssize_t show_sysctl_tfa(struct device *cdev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	return snprintf(buf, 40, "%d\n", allow_tsx_force_abort);
+}
+
+static ssize_t set_sysctl_tfa(struct device *cdev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	bool val;
+	ssize_t ret;
+
+	ret = kstrtobool(buf, &val);
+	if (ret)
+		return ret;
+
+	/* no change */
+	if (val == allow_tsx_force_abort)
+		return count;
+
+	allow_tsx_force_abort = val;
+
+	get_online_cpus();
+	on_each_cpu(update_tfa_sched, NULL, 1);
+	put_online_cpus();
+
+	return count;
+}
+
+
 static DEVICE_ATTR_RW(freeze_on_smi);

 static ssize_t branches_show(struct device *cdev,
@ -4189,7 +4391,9 @@ static struct attribute *intel_pmu_caps_attrs[] = {
       NULL
 };

-static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort);
+static DEVICE_ATTR(allow_tsx_force_abort, 0644,
+		   show_sysctl_tfa,
+		   set_sysctl_tfa);

 static struct attribute *intel_pmu_attrs[] = {
 	&dev_attr_freeze_on_smi.attr,
@ -4450,6 +4654,32 @@ __init int intel_pmu_init(void)
 		name = "goldmont_plus";
 		break;

+	case INTEL_FAM6_ATOM_TREMONT_X:
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
+		hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_slm_event_constraints;
+		x86_pmu.extra_regs = intel_tnt_extra_regs;
+		/*
+		 * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+		 * for precise cycles.
+		 */
+		x86_pmu.pebs_aliases = NULL;
+		x86_pmu.pebs_prec_dist = true;
+		x86_pmu.lbr_pt_coexist = true;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.get_event_constraints = tnt_get_event_constraints;
+		extra_attr = slm_format_attr;
+		pr_cont("Tremont events, ");
+		name = "Tremont";
+		break;
+
 	case INTEL_FAM6_WESTMERE:
 	case INTEL_FAM6_WESTMERE_EP:
 	case INTEL_FAM6_WESTMERE_EX:
@ -4698,13 +4928,41 @@ __init int intel_pmu_init(void)
 			x86_pmu.get_event_constraints = tfa_get_event_constraints;
 			x86_pmu.enable_all = intel_tfa_pmu_enable_all;
 			x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
-			intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr;
+			intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr;
 		}

 		pr_cont("Skylake events, ");
 		name = "skylake";
 		break;

+	case INTEL_FAM6_ICELAKE_MOBILE:
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_icl_event_constraints;
+		x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_icl_extra_regs;
+		x86_pmu.pebs_aliases = NULL;
+		x86_pmu.pebs_prec_dist = true;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = icl_get_event_constraints;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
+		extra_attr = merge_attr(extra_attr, skl_format_attr);
+		x86_pmu.cpu_events = get_icl_events_attrs();
+		x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02);
+		x86_pmu.lbr_pt_coexist = true;
+		intel_pmu_pebs_data_source_skl(false);
+		pr_cont("Icelake events, ");
+		name = "icelake";
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@ -578,6 +578,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates),

 	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
+
+	X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@ -849,6 +849,26 @@ struct event_constraint intel_skl_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };

+struct event_constraint intel_icl_pebs_event_constraints[] = {
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),	/* INST_RETIRED.PREC_DIST */
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL),	/* SLOTS */
+
+	INTEL_PLD_CONSTRAINT(0x1cd, 0xff),			/* MEM_TRANS_RETIRED.LOAD_LATENCY */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),	/* MEM_INST_RETIRED.LOAD */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),	/* MEM_INST_RETIRED.STORE */
+
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
+
+	INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),		/* MEM_INST_RETIRED.* */
+
+	/*
+	 * Everything else is handled by PMU_FL_PEBS_ALL, because we
+	 * need the full constraints from the main table.
+	 */
+
+	EVENT_CONSTRAINT_END
+};
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
@ -858,7 +878,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)

 	if (x86_pmu.pebs_constraints) {
 		for_each_event_constraint(c, x86_pmu.pebs_constraints) {
-			if ((event->hw.config & c->cmask) == c->code) {
+			if (constraint_match(c, event->hw.config)) {
 				event->hw.flags |= c->flags;
 				return c;
 			}
@ -906,17 +926,87 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)

 	if (cpuc->n_pebs == cpuc->n_large_pebs) {
 		threshold = ds->pebs_absolute_maximum -
-			reserved * x86_pmu.pebs_record_size;
+			reserved * cpuc->pebs_record_size;
 	} else {
-		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+		threshold = ds->pebs_buffer_base + cpuc->pebs_record_size;
 	}

 	ds->pebs_interrupt_threshold = threshold;
 }

-static void
-pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+static void adaptive_pebs_record_size_update(void)
 {
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	u64 pebs_data_cfg = cpuc->pebs_data_cfg;
+	int sz = sizeof(struct pebs_basic);
+
+	if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
+		sz += sizeof(struct pebs_meminfo);
+	if (pebs_data_cfg & PEBS_DATACFG_GP)
+		sz += sizeof(struct pebs_gprs);
+	if (pebs_data_cfg & PEBS_DATACFG_XMMS)
+		sz += sizeof(struct pebs_xmm);
+	if (pebs_data_cfg & PEBS_DATACFG_LBRS)
+		sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry);
+
+	cpuc->pebs_record_size = sz;
+}
+
+#define PERF_PEBS_MEMINFO_TYPE	(PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
+				PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+				PERF_SAMPLE_TRANSACTION)
+
+static u64 pebs_update_adaptive_cfg(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	u64 sample_type = attr->sample_type;
+	u64 pebs_data_cfg = 0;
+	bool gprs, tsx_weight;
+
+	if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) &&
+	    attr->precise_ip > 1)
+		return pebs_data_cfg;
+
+	if (sample_type & PERF_PEBS_MEMINFO_TYPE)
+		pebs_data_cfg |= PEBS_DATACFG_MEMINFO;
+
+	/*
+	 * We need GPRs when:
+	 * + user requested them
+	 * + precise_ip < 2 for the non event IP
+	 * + For RTM TSX weight we need GPRs for the abort code.
+	 */
+	gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
+	       (attr->sample_regs_intr & PEBS_GP_REGS);
+
+	tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+		     ((attr->config & INTEL_ARCH_EVENT_MASK) ==
+		      x86_pmu.rtm_abort_event);
+
+	if (gprs || (attr->precise_ip < 2) || tsx_weight)
+		pebs_data_cfg |= PEBS_DATACFG_GP;
+
+	if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
+	    (attr->sample_regs_intr & PEBS_XMM_REGS))
+		pebs_data_cfg |= PEBS_DATACFG_XMMS;
+
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		/*
+		 * For now always log all LBRs. Could configure this
+		 * later.
+		 */
+		pebs_data_cfg |= PEBS_DATACFG_LBRS |
+			((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT);
+	}
+
+	return pebs_data_cfg;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
+		  struct perf_event *event, bool add)
+{
+	struct pmu *pmu = event->ctx->pmu;
 	/*
 	 * Make sure we get updated with the first PEBS
 	 * event. It will trigger also during removal, but
@ -933,6 +1023,29 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
 		update = true;
 	}

+	/*
+	 * The PEBS record doesn't shrink on pmu::del(). Doing so would require
+	 * iterating all remaining PEBS events to reconstruct the config.
+	 */
+	if (x86_pmu.intel_cap.pebs_baseline && add) {
+		u64 pebs_data_cfg;
+
+		/* Clear pebs_data_cfg and pebs_record_size for first PEBS. */
+		if (cpuc->n_pebs == 1) {
+			cpuc->pebs_data_cfg = 0;
+			cpuc->pebs_record_size = sizeof(struct pebs_basic);
+		}
+
+		pebs_data_cfg = pebs_update_adaptive_cfg(event);
+
+		/* Update pebs_record_size if new event requires more data. */
+		if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
+			cpuc->pebs_data_cfg |= pebs_data_cfg;
+			adaptive_pebs_record_size_update();
+			update = true;
+		}
+	}
+
 	if (update)
 		pebs_update_threshold(cpuc);
 }
@ -947,7 +1060,7 @@ void intel_pmu_pebs_add(struct perf_event *event)
 	if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
 		cpuc->n_large_pebs++;

-	pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
+	pebs_update_state(needed_cb, cpuc, event, true);
 }

 void intel_pmu_pebs_enable(struct perf_event *event)
@ -960,11 +1073,19 @@ void intel_pmu_pebs_enable(struct perf_event *event)

 	cpuc->pebs_enabled |= 1ULL << hwc->idx;

-	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+	if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
 		cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled |= 1ULL << 63;

+	if (x86_pmu.intel_cap.pebs_baseline) {
+		hwc->config |= ICL_EVENTSEL_ADAPTIVE;
+		if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
+			wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
+			cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
+		}
+	}
+
 	/*
 	 * Use auto-reload if possible to save a MSR write in the PMI.
 	 * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
@ -991,7 +1112,7 @@ void intel_pmu_pebs_del(struct perf_event *event)
 	if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
 		cpuc->n_large_pebs--;

-	pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
+	pebs_update_state(needed_cb, cpuc, event, false);
 }

 void intel_pmu_pebs_disable(struct perf_event *event)
@ -1004,7 +1125,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)

 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);

-	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+	if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
+	    (x86_pmu.version < 5))
 		cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
@ -1125,34 +1247,57 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	return 0;
 }

-static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
+static inline u64 intel_get_tsx_weight(u64 tsx_tuning)
 {
-	if (pebs->tsx_tuning) {
-		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+	if (tsx_tuning) {
+		union hsw_tsx_tuning tsx = { .value = tsx_tuning };
 		return tsx.cycles_last_block;
 	}
 	return 0;
 }

-static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
+static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax)
 {
-	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+	u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;

 	/* For RTM XABORTs also log the abort code from AX */
-	if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
-		txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+	if ((txn & PERF_TXN_TRANSACTION) && (ax & 1))
+		txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
 	return txn;
 }

-static void setup_pebs_sample_data(struct perf_event *event,
-				   struct pt_regs *iregs, void *__pebs,
-				   struct perf_sample_data *data,
-				   struct pt_regs *regs)
+static inline u64 get_pebs_status(void *n)
 {
+	if (x86_pmu.intel_cap.pebs_format < 4)
+		return ((struct pebs_record_nhm *)n)->status;
+	return ((struct pebs_basic *)n)->applicable_counters;
+}
+
 #define PERF_X86_EVENT_PEBS_HSW_PREC \
 		(PERF_X86_EVENT_PEBS_ST_HSW | \
 		 PERF_X86_EVENT_PEBS_LD_HSW | \
 		 PERF_X86_EVENT_PEBS_NA_HSW)
+
+static u64 get_data_src(struct perf_event *event, u64 aux)
+{
+	u64 val = PERF_MEM_NA;
+	int fl = event->hw.flags;
+	bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+	if (fl & PERF_X86_EVENT_PEBS_LDLAT)
+		val = load_latency_data(aux);
+	else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+		val = precise_datala_hsw(event, aux);
+	else if (fst)
+		val = precise_store_data(aux);
+	return val;
+}
+
+static void setup_pebs_fixed_sample_data(struct perf_event *event,
+				   struct pt_regs *iregs, void *__pebs,
+				   struct perf_sample_data *data,
+				   struct pt_regs *regs)
+{
 	/*
 	 * We cast to the biggest pebs_record but are careful not to
 	 * unconditionally access the 'extra' entries.
@ -1160,17 +1305,13 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct pebs_record_skl *pebs = __pebs;
 	u64 sample_type;
-	int fll, fst, dsrc;
-	int fl = event->hw.flags;
+	int fll;

 	if (pebs == NULL)
 		return;

 	sample_type = event->attr.sample_type;
-	dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
-
-	fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
-	fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+	fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;

 	perf_sample_data_init(data, 0, event->hw.last_period);

@ -1185,16 +1326,8 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	/*
 	 * data.data_src encodes the data source
 	 */
-	if (dsrc) {
-		u64 val = PERF_MEM_NA;
-		if (fll)
-			val = load_latency_data(pebs->dse);
-		else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
-			val = precise_datala_hsw(event, pebs->dse);
-		else if (fst)
-			val = precise_store_data(pebs->dse);
-		data->data_src.val = val;
-	}
+	if (sample_type & PERF_SAMPLE_DATA_SRC)
+		data->data_src.val = get_data_src(event, pebs->dse);

 	/*
 	 * We must however always use iregs for the unwinder to stay sane; the
@ -1281,10 +1414,11 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
 		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-			data->weight = intel_hsw_weight(pebs);
+			data->weight = intel_get_tsx_weight(pebs->tsx_tuning);

 		if (sample_type & PERF_SAMPLE_TRANSACTION)
-			data->txn = intel_hsw_transaction(pebs);
+			data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
+							      pebs->ax);
 	}

 	/*
@ -1301,6 +1435,140 @@ static void setup_pebs_sample_data(struct perf_event *event,
 		data->br_stack = &cpuc->lbr_stack;
 }

+static void adaptive_pebs_save_regs(struct pt_regs *regs,
+				    struct pebs_gprs *gprs)
+{
+	regs->ax = gprs->ax;
+	regs->bx = gprs->bx;
+	regs->cx = gprs->cx;
+	regs->dx = gprs->dx;
+	regs->si = gprs->si;
+	regs->di = gprs->di;
+	regs->bp = gprs->bp;
+	regs->sp = gprs->sp;
+#ifndef CONFIG_X86_32
+	regs->r8 = gprs->r8;
+	regs->r9 = gprs->r9;
+	regs->r10 = gprs->r10;
+	regs->r11 = gprs->r11;
+	regs->r12 = gprs->r12;
+	regs->r13 = gprs->r13;
+	regs->r14 = gprs->r14;
+	regs->r15 = gprs->r15;
+#endif
+}
+
+/*
+ * With adaptive PEBS the layout depends on what fields are configured.
+ */
+
+static void setup_pebs_adaptive_sample_data(struct perf_event *event,
+					    struct pt_regs *iregs, void *__pebs,
+					    struct perf_sample_data *data,
+					    struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct pebs_basic *basic = __pebs;
+	void *next_record = basic + 1;
+	u64 sample_type;
+	u64 format_size;
+	struct pebs_meminfo *meminfo = NULL;
+	struct pebs_gprs *gprs = NULL;
+	struct x86_perf_regs *perf_regs;
+
+	if (basic == NULL)
+		return;
+
+	perf_regs = container_of(regs, struct x86_perf_regs, regs);
+	perf_regs->xmm_regs = NULL;
+
+	sample_type = event->attr.sample_type;
+	format_size = basic->format_size;
+	perf_sample_data_init(data, 0, event->hw.last_period);
+	data->period = event->hw.last_period;
+
+	if (event->attr.use_clockid == 0)
+		data->time = native_sched_clock_from_tsc(basic->tsc);
+
+	/*
+	 * We must however always use iregs for the unwinder to stay sane; the
+	 * record BP,SP,IP can point into thin air when the record is from a
+	 * previous PMI context or an (I)RET happened between the record and
+	 * PMI.
+	 */
+	if (sample_type & PERF_SAMPLE_CALLCHAIN)
+		data->callchain = perf_callchain(event, iregs);
+
+	*regs = *iregs;
+	/* The ip in basic is EventingIP */
+	set_linear_ip(regs, basic->ip);
+	regs->flags = PERF_EFLAGS_EXACT;
+
+	/*
+	 * The record for MEMINFO is in front of GP
+	 * But PERF_SAMPLE_TRANSACTION needs gprs->ax.
+	 * Save the pointer here but process later.
+	 */
+	if (format_size & PEBS_DATACFG_MEMINFO) {
+		meminfo = next_record;
+		next_record = meminfo + 1;
+	}
+
+	if (format_size & PEBS_DATACFG_GP) {
+		gprs = next_record;
+		next_record = gprs + 1;
+
+		if (event->attr.precise_ip < 2) {
+			set_linear_ip(regs, gprs->ip);
+			regs->flags &= ~PERF_EFLAGS_EXACT;
+		}
+
+		if (sample_type & PERF_SAMPLE_REGS_INTR)
+			adaptive_pebs_save_regs(regs, gprs);
+	}
+
+	if (format_size & PEBS_DATACFG_MEMINFO) {
+		if (sample_type & PERF_SAMPLE_WEIGHT)
+			data->weight = meminfo->latency ?:
+				intel_get_tsx_weight(meminfo->tsx_tuning);
+
+		if (sample_type & PERF_SAMPLE_DATA_SRC)
+			data->data_src.val = get_data_src(event, meminfo->aux);
+
+		if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+			data->addr = meminfo->address;
+
+		if (sample_type & PERF_SAMPLE_TRANSACTION)
+			data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
+							  gprs ? gprs->ax : 0);
+	}
+
+	if (format_size & PEBS_DATACFG_XMMS) {
+		struct pebs_xmm *xmm = next_record;
+
+		next_record = xmm + 1;
+		perf_regs->xmm_regs = xmm->xmm;
+	}
+
+	if (format_size & PEBS_DATACFG_LBRS) {
+		struct pebs_lbr *lbr = next_record;
+		int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
+					& 0xff) + 1;
+		next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry);
+
+		if (has_branch_stack(event)) {
+			intel_pmu_store_pebs_lbrs(lbr);
+			data->br_stack = &cpuc->lbr_stack;
+		}
+	}
+
+	WARN_ONCE(next_record != __pebs + (format_size >> 48),
+			"PEBS record size %llu, expected %llu, config %llx\n",
+			format_size >> 48,
+			(u64)(next_record - __pebs),
+			basic->format_size);
+}
+
 static inline void *
 get_next_pebs_record_by_bit(void *base, void *top, int bit)
 {
@ -1318,19 +1586,19 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 	if (base == NULL)
 		return NULL;

-	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-		struct pebs_record_nhm *p = at;
+	for (at = base; at < top; at += cpuc->pebs_record_size) {
+		unsigned long status = get_pebs_status(at);

-		if (test_bit(bit, (unsigned long *)&p->status)) {
+		if (test_bit(bit, (unsigned long *)&status)) {
 			/* PEBS v3 has accurate status bits */
 			if (x86_pmu.intel_cap.pebs_format >= 3)
 				return at;

-			if (p->status == (1 << bit))
+			if (status == (1 << bit))
 				return at;

 			/* clear non-PEBS bit and re-check */
-			pebs_status = p->status & cpuc->pebs_enabled;
+			pebs_status = status & cpuc->pebs_enabled;
 			pebs_status &= PEBS_COUNTER_MASK;
 			if (pebs_status == (1 << bit))
 				return at;
@ -1410,11 +1678,18 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs,
 				   void *base, void *top,
-				   int bit, int count)
+				   int bit, int count,
+				   void (*setup_sample)(struct perf_event *,
+						struct pt_regs *,
+						void *,
+						struct perf_sample_data *,
+						struct pt_regs *))
 {
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_sample_data data;
-	struct pt_regs regs;
+	struct x86_perf_regs perf_regs;
+	struct pt_regs *regs = &perf_regs.regs;
 	void *at = get_next_pebs_record_by_bit(base, top, bit);

 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
@ -1429,20 +1704,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		return;

 	while (count > 1) {
-		setup_pebs_sample_data(event, iregs, at, &data, &regs);
-		perf_event_output(event, &data, &regs);
-		at += x86_pmu.pebs_record_size;
+		setup_sample(event, iregs, at, &data, regs);
+		perf_event_output(event, &data, regs);
+		at += cpuc->pebs_record_size;
 		at = get_next_pebs_record_by_bit(at, top, bit);
 		count--;
 	}

-	setup_pebs_sample_data(event, iregs, at, &data, &regs);
+	setup_sample(event, iregs, at, &data, regs);

 	/*
 	 * All but the last records are processed.
 	 * The last one is left to be able to call the overflow handler.
 	 */
-	if (perf_event_overflow(event, &data, &regs)) {
+	if (perf_event_overflow(event, &data, regs)) {
 		x86_pmu_stop(event, 0);
 		return;
 	}
@ -1483,7 +1758,27 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		return;
 	}

-	__intel_pmu_pebs_event(event, iregs, at, top, 0, n);
+	__intel_pmu_pebs_event(event, iregs, at, top, 0, n,
+			       setup_pebs_fixed_sample_data);
+}
+
+static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
+{
+	struct perf_event *event;
+	int bit;
+
+	/*
+	 * The drain_pebs() could be called twice in a short period
+	 * for auto-reload event in pmu::read(). There are no
+	 * overflows have happened in between.
+	 * It needs to call intel_pmu_save_and_restart_reload() to
+	 * update the event->count for this case.
+	 */
+	for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
+		event = cpuc->events[bit];
+		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+			intel_pmu_save_and_restart_reload(event, 0);
+	}
 }

 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@ -1513,19 +1808,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	}

 	if (unlikely(base >= top)) {
-		/*
-		 * The drain_pebs() could be called twice in a short period
-		 * for auto-reload event in pmu::read(). There are no
-		 * overflows have happened in between.
-		 * It needs to call intel_pmu_save_and_restart_reload() to
-		 * update the event->count for this case.
-		 */
-		for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
-				 size) {
-			event = cpuc->events[bit];
-			if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
-				intel_pmu_save_and_restart_reload(event, 0);
-		}
+		intel_pmu_pebs_event_update_no_drain(cpuc, size);
 		return;
 	}

@ -1538,8 +1821,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)

 		/* PEBS v3 has more accurate status bits */
 		if (x86_pmu.intel_cap.pebs_format >= 3) {
-			for_each_set_bit(bit, (unsigned long *)&pebs_status,
-					 size)
+			for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
 				counts[bit]++;

 			continue;
@ -1578,8 +1860,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		 * If collision happened, the record will be dropped.
 		 */
 		if (p->status != (1ULL << bit)) {
-			for_each_set_bit(i, (unsigned long *)&pebs_status,
-					 x86_pmu.max_pebs_events)
+			for_each_set_bit(i, (unsigned long *)&pebs_status, size)
 				error[i]++;
 			continue;
 		}
@ -1587,7 +1868,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		counts[bit]++;
 	}

-	for (bit = 0; bit < size; bit++) {
+	for_each_set_bit(bit, (unsigned long *)&mask, size) {
 		if ((counts[bit] == 0) && (error[bit] == 0))
 			continue;

@ -1608,11 +1889,66 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)

 		if (counts[bit]) {
 			__intel_pmu_pebs_event(event, iregs, base,
-					       top, bit, counts[bit]);
+					       top, bit, counts[bit],
+					       setup_pebs_fixed_sample_data);
 		}
 	}
 }

+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
+{
+	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct perf_event *event;
+	void *base, *at, *top;
+	int bit, size;
+	u64 mask;
+
+	if (!x86_pmu.pebs_active)
+		return;
+
+	base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_basic *)(unsigned long)ds->pebs_index;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	mask = ((1ULL << x86_pmu.max_pebs_events) - 1) |
+	       (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
+	size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+
+	if (unlikely(base >= top)) {
+		intel_pmu_pebs_event_update_no_drain(cpuc, size);
+		return;
+	}
+
+	for (at = base; at < top; at += cpuc->pebs_record_size) {
+		u64 pebs_status;
+
+		pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
+		pebs_status &= mask;
+
+		for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
+			counts[bit]++;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&mask, size) {
+		if (counts[bit] == 0)
+			continue;
+
+		event = cpuc->events[bit];
+		if (WARN_ON_ONCE(!event))
+			continue;
+
+		if (WARN_ON_ONCE(!event->attr.precise_ip))
+			continue;
+
+		__intel_pmu_pebs_event(event, iregs, base,
+				       top, bit, counts[bit],
+				       setup_pebs_adaptive_sample_data);
+	}
+}
+
 /*
 * BTS, PEBS probe and setup
 */
@ -1628,12 +1964,18 @@ void __init intel_ds_init(void)
 	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
 	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
 	x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
-	if (x86_pmu.version <= 4)
+	if (x86_pmu.version <= 4) {
 		x86_pmu.pebs_no_isolation = 1;
+		x86_pmu.pebs_no_xmm_regs = 1;
+	}
 	if (x86_pmu.pebs) {
 		char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+		char *pebs_qual = "";
 		int format = x86_pmu.intel_cap.pebs_format;

+		if (format < 4)
+			x86_pmu.intel_cap.pebs_baseline = 0;
+
 		switch (format) {
 		case 0:
 			pr_cont("PEBS fmt0%c, ", pebs_type);
@ -1669,6 +2011,29 @@ void __init intel_ds_init(void)
 			x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
 			break;

+		case 4:
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
+			x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
+			if (x86_pmu.intel_cap.pebs_baseline) {
+				x86_pmu.large_pebs_flags |=
+					PERF_SAMPLE_BRANCH_STACK |
+					PERF_SAMPLE_TIME;
+				x86_pmu.flags |= PMU_FL_PEBS_ALL;
+				pebs_qual = "-baseline";
+			} else {
+				/* Only basic record supported */
+				x86_pmu.pebs_no_xmm_regs = 1;
+				x86_pmu.large_pebs_flags &=
+					~(PERF_SAMPLE_ADDR |
+					  PERF_SAMPLE_TIME |
+					  PERF_SAMPLE_DATA_SRC |
+					  PERF_SAMPLE_TRANSACTION |
+					  PERF_SAMPLE_REGS_USER |
+					  PERF_SAMPLE_REGS_INTR);
+			}
+			pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
+			break;
+
 		default:
 			pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@ -488,6 +488,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
 	 * be 'new'. Conversely, a new event can get installed through the
 	 * context switch path for the first time.
 	 */
+	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+		cpuc->lbr_pebs_users++;
 	perf_sched_cb_inc(event->ctx->pmu);
 	if (!cpuc->lbr_users++ && !event->total_time_running)
 		intel_pmu_lbr_reset();
@ -507,8 +509,11 @@ void intel_pmu_lbr_del(struct perf_event *event)
 		task_ctx->lbr_callstack_users--;
 	}

+	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+		cpuc->lbr_pebs_users--;
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
+	WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
 	perf_sched_cb_dec(event->ctx->pmu);
 }

@ -658,7 +663,13 @@ void intel_pmu_lbr_read(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

-	if (!cpuc->lbr_users)
+	/*
+	 * Don't read when all LBRs users are using adaptive PEBS.
+	 *
+	 * This could be smarter and actually check the event,
+	 * but this simple approach seems to work for now.
+	 */
+	if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;

 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
@ -1080,6 +1091,28 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	}
 }

+void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int i;
+
+	cpuc->lbr_stack.nr = x86_pmu.lbr_nr;
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		u64 info = lbr->lbr[i].info;
+		struct perf_branch_entry *e = &cpuc->lbr_entries[i];
+
+		e->from		= lbr->lbr[i].from;
+		e->to		= lbr->lbr[i].to;
+		e->mispred	= !!(info & LBR_INFO_MISPRED);
+		e->predicted	= !(info & LBR_INFO_MISPRED);
+		e->in_tx	= !!(info & LBR_INFO_IN_TX);
+		e->abort	= !!(info & LBR_INFO_ABORT);
+		e->cycles	= info & LBR_INFO_CYCLES;
+		e->reserved	= 0;
+	}
+	intel_pmu_lbr_filter(cpuc);
+}
+
 /*
 * Map interface branch filters onto LBR filters
 */
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@ -775,6 +775,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),

 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
+
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE,  skl_rapl_init),
 	{},
 };

--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@ -1367,6 +1367,11 @@ static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
 	.pci_init = skx_uncore_pci_init,
 };

+static const struct intel_uncore_init_fun icl_uncore_init __initconst = {
+	.cpu_init = icl_uncore_cpu_init,
+	.pci_init = skl_uncore_pci_init,
+};
+
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
 	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP,	  nhm_uncore_init),
 	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM,	  nhm_uncore_init),
@ -1393,6 +1398,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
 	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,      skx_uncore_init),
 	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init),
 	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init),
 	{},
 };

--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@ -512,6 +512,7 @@ int skl_uncore_pci_init(void);
 void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
+void icl_uncore_cpu_init(void);
 int snb_pci2phy_map_init(int devid);

 /* uncore_snbep.c */
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@ -34,6 +34,8 @@
 #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC	0x3e33
 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC	0x3eca
 #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC	0x3e32
+#define PCI_DEVICE_ID_INTEL_ICL_U_IMC		0x8a02
+#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC		0x8a12

 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@ -93,6 +95,12 @@
 #define SKL_UNC_PERF_GLOBAL_CTL			0xe01
 #define SKL_UNC_GLOBAL_CTL_CORE_ALL		((1 << 5) - 1)

+/* ICL Cbo register */
+#define ICL_UNC_CBO_CONFIG			0x396
+#define ICL_UNC_NUM_CBO_MASK			0xf
+#define ICL_UNC_CBO_0_PER_CTR0			0x702
+#define ICL_UNC_CBO_MSR_OFFSET			0x8
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
@ -280,6 +288,70 @@ void skl_uncore_cpu_init(void)
 	snb_uncore_arb.ops = &skl_uncore_msr_ops;
 }

+static struct intel_uncore_type icl_uncore_cbox = {
+	.name		= "cbox",
+	.num_counters   = 4,
+	.perf_ctr_bits	= 44,
+	.perf_ctr	= ICL_UNC_CBO_0_PER_CTR0,
+	.event_ctl	= SNB_UNC_CBO_0_PERFEVTSEL0,
+	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
+	.msr_offset	= ICL_UNC_CBO_MSR_OFFSET,
+	.ops		= &skl_uncore_msr_ops,
+	.format_group	= &snb_uncore_format_group,
+};
+
+static struct uncore_event_desc icl_uncore_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"),
+	{ /* end: all zeroes */ },
+};
+
+static struct attribute *icl_uncore_clock_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group icl_uncore_clock_format_group = {
+	.name = "format",
+	.attrs = icl_uncore_clock_formats_attr,
+};
+
+static struct intel_uncore_type icl_uncore_clockbox = {
+	.name		= "clock",
+	.num_counters	= 1,
+	.num_boxes	= 1,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNB_UNC_FIXED_CTR,
+	.fixed_ctl	= SNB_UNC_FIXED_CTR_CTRL,
+	.single_fixed	= 1,
+	.event_mask	= SNB_UNC_CTL_EV_SEL_MASK,
+	.format_group	= &icl_uncore_clock_format_group,
+	.ops		= &skl_uncore_msr_ops,
+	.event_descs	= icl_uncore_events,
+};
+
+static struct intel_uncore_type *icl_msr_uncores[] = {
+	&icl_uncore_cbox,
+	&snb_uncore_arb,
+	&icl_uncore_clockbox,
+	NULL,
+};
+
+static int icl_get_cbox_num(void)
+{
+	u64 num_boxes;
+
+	rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes);
+
+	return num_boxes & ICL_UNC_NUM_CBO_MASK;
+}
+
+void icl_uncore_cpu_init(void)
+{
+	uncore_msr_uncores = icl_msr_uncores;
+	icl_uncore_cbox.num_boxes = icl_get_cbox_num();
+	snb_uncore_arb.ops = &skl_uncore_msr_ops;
+}
+
 enum {
 	SNB_PCI_UNCORE_IMC,
 };
@ -668,6 +740,18 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
 	{ /* end: all zeroes */ },
 };

+static const struct pci_device_id icl_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
 static struct pci_driver snb_uncore_pci_driver = {
 	.name		= "snb_uncore",
 	.id_table	= snb_uncore_pci_ids,
@ -693,6 +777,11 @@ static struct pci_driver skl_uncore_pci_driver = {
 	.id_table	= skl_uncore_pci_ids,
 };

+static struct pci_driver icl_uncore_pci_driver = {
+	.name		= "icl_uncore",
+	.id_table	= icl_uncore_pci_ids,
+};
+
 struct imc_uncore_pci_dev {
 	__u32 pci_id;
 	struct pci_driver *driver;
@ -732,6 +821,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver),  /* 8th Gen Core S 4 Cores Server */
 	IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver),  /* 8th Gen Core S 6 Cores Server */
 	IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver),  /* 8th Gen Core S 8 Cores Server */
+	IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
+	IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
 	{  /* end marker */ }
 };

--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@ -89,6 +89,7 @@ static bool test_intel(int idx)
 	case INTEL_FAM6_SKYLAKE_X:
 	case INTEL_FAM6_KABYLAKE_MOBILE:
 	case INTEL_FAM6_KABYLAKE_DESKTOP:
+	case INTEL_FAM6_ICELAKE_MOBILE:
 		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
 			return true;
 		break;
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@ -49,28 +49,33 @@ struct event_constraint {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 		u64		idxmsk64;
 	};
-	u64	code;
-	u64	cmask;
-	int	weight;
-	int	overlap;
-	int	flags;
+	u64		code;
+	u64		cmask;
+	int		weight;
+	int		overlap;
+	int		flags;
+	unsigned int	size;
 };
+
+static inline bool constraint_match(struct event_constraint *c, u64 ecode)
+{
+	return ((ecode & c->cmask) - c->code) <= (u64)c->size;
+}
+
 /*
 * struct hw_perf_event.flags flags
 */
 #define PERF_X86_EVENT_PEBS_LDLAT	0x0001 /* ld+ldlat data address sampling */
 #define PERF_X86_EVENT_PEBS_ST		0x0002 /* st data address sampling */
 #define PERF_X86_EVENT_PEBS_ST_HSW	0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED	0x0008 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW	0x0010 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW	0x0020 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD	0x0400 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_LARGE_PEBS	0x0800 /* use large PEBS */
-
+#define PERF_X86_EVENT_PEBS_LD_HSW	0x0008 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW	0x0010 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x0020 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x0040 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0080 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT	0x0100 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD	0x0200 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_LARGE_PEBS	0x0400 /* use large PEBS */

 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
@ -116,6 +121,24 @@ struct amd_nb {
 	 (1ULL << PERF_REG_X86_R14)   | \
 	 (1ULL << PERF_REG_X86_R15))

+#define PEBS_XMM_REGS                   \
+	((1ULL << PERF_REG_X86_XMM0)  | \
+	 (1ULL << PERF_REG_X86_XMM1)  | \
+	 (1ULL << PERF_REG_X86_XMM2)  | \
+	 (1ULL << PERF_REG_X86_XMM3)  | \
+	 (1ULL << PERF_REG_X86_XMM4)  | \
+	 (1ULL << PERF_REG_X86_XMM5)  | \
+	 (1ULL << PERF_REG_X86_XMM6)  | \
+	 (1ULL << PERF_REG_X86_XMM7)  | \
+	 (1ULL << PERF_REG_X86_XMM8)  | \
+	 (1ULL << PERF_REG_X86_XMM9)  | \
+	 (1ULL << PERF_REG_X86_XMM10) | \
+	 (1ULL << PERF_REG_X86_XMM11) | \
+	 (1ULL << PERF_REG_X86_XMM12) | \
+	 (1ULL << PERF_REG_X86_XMM13) | \
+	 (1ULL << PERF_REG_X86_XMM14) | \
+	 (1ULL << PERF_REG_X86_XMM15))
+
 /*
 * Per register state.
 */
@ -207,10 +230,16 @@ struct cpu_hw_events {
 	int			n_pebs;
 	int			n_large_pebs;

+	/* Current super set of events hardware configuration */
+	u64			pebs_data_cfg;
+	u64			active_pebs_data_cfg;
+	int			pebs_record_size;
+
 	/*
 	 * Intel LBR bits
 	 */
 	int				lbr_users;
+	int				lbr_pebs_users;
 	struct perf_branch_stack	lbr_stack;
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 	struct er_account		*lbr_sel;
@ -257,18 +286,29 @@ struct cpu_hw_events {
 	void				*kfree_on_online[X86_PERF_KFREE_MAX];
 };

-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) {	\
 	{ .idxmsk64 = (n) },		\
 	.code = (c),			\
+	.size = (e) - (c),		\
 	.cmask = (m),			\
 	.weight = (w),			\
 	.overlap = (o),			\
 	.flags = f,			\
 }

+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \
+	__EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f)
+
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)

+/*
+ * The constraint_match() function only works for 'simple' event codes
+ * and not for extended (AMD64_EVENTSEL_EVENT) events codes.
+ */
+#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \
+	__EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0)
+
 #define INTEL_EXCLEVT_CONSTRAINT(c, n)	\
 	__EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
 			   0, PERF_X86_EVENT_EXCL)
@ -303,6 +343,12 @@ struct cpu_hw_events {
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)

+/*
+ * Constraint on a range of Event codes
+ */
+#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n)			\
+	EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
 /*
 * Constraint on the Event code + UMask + fixed-mask
 *
@ -350,6 +396,9 @@ struct cpu_hw_events {
 #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)

+#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n)			\
+	EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
 /* Check only flags, but allow all event/umask */
 #define INTEL_ALL_EVENT_CONSTRAINT(code, n)	\
 	EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
@ -366,6 +415,11 @@ struct cpu_hw_events {
 			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)

+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \
+	__EVENT_CONSTRAINT_RANGE(code, end, n,				\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
 #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
 	__EVENT_CONSTRAINT(code, n,			\
 			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
@ -473,6 +527,7 @@ union perf_capabilities {
 		 * values > 32bit.
 		 */
 		u64	full_width_write:1;
+		u64     pebs_baseline:1;
 	};
 	u64	capabilities;
 };
@ -613,14 +668,16 @@ struct x86_pmu {
 			pebs_broken		:1,
 			pebs_prec_dist		:1,
 			pebs_no_tlb		:1,
-			pebs_no_isolation	:1;
+			pebs_no_isolation	:1,
+			pebs_no_xmm_regs	:1;
 	int		pebs_record_size;
 	int		pebs_buffer_size;
+	int		max_pebs_events;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
-	int 		max_pebs_events;
 	unsigned long	large_pebs_flags;
+	u64		rtm_abort_event;

 	/*
 	 * Intel LBR
@ -714,6 +771,7 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {		\
 	.event_str_ht	= ht,						\
 }

+struct pmu *x86_get_pmu(void);
 extern struct x86_pmu x86_pmu __read_mostly;

 static inline bool x86_pmu_has_lbr_callstack(void)
@ -941,6 +999,8 @@ extern struct event_constraint intel_bdw_pebs_event_constraints[];

 extern struct event_constraint intel_skl_pebs_event_constraints[];

+extern struct event_constraint intel_icl_pebs_event_constraints[];
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);

 void intel_pmu_pebs_add(struct perf_event *event);
@ -959,6 +1019,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);

 void intel_pmu_auto_reload_read(struct perf_event *event);

+void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
+
 void intel_ds_init(void);

 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@ -8,7 +8,7 @@

 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS		8
-#define MAX_FIXED_PEBS_EVENTS	3
+#define MAX_FIXED_PEBS_EVENTS	4

 /*
 * A debug store configuration.
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@ -116,6 +116,7 @@
 #define LBR_INFO_CYCLES			0xffff

 #define MSR_IA32_PEBS_ENABLE		0x000003f1
+#define MSR_PEBS_DATA_CFG		0x000003f2
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@ -7,7 +7,7 @@
 */

 #define INTEL_PMC_MAX_GENERIC				       32
-#define INTEL_PMC_MAX_FIXED					3
+#define INTEL_PMC_MAX_FIXED					4
 #define INTEL_PMC_IDX_FIXED				       32

 #define X86_PMC_IDX_MAX					       64
@ -32,6 +32,8 @@

 #define HSW_IN_TX					(1ULL << 32)
 #define HSW_IN_TX_CHECKPOINTED				(1ULL << 33)
+#define ICL_EVENTSEL_ADAPTIVE				(1ULL << 34)
+#define ICL_FIXED_0_ADAPTIVE				(1ULL << 32)

 #define AMD64_EVENTSEL_INT_CORE_ENABLE			(1ULL << 36)
 #define AMD64_EVENTSEL_GUESTONLY			(1ULL << 40)
@ -87,6 +89,12 @@
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED		6
 #define ARCH_PERFMON_EVENTS_COUNT			7

+#define PEBS_DATACFG_MEMINFO	BIT_ULL(0)
+#define PEBS_DATACFG_GP	BIT_ULL(1)
+#define PEBS_DATACFG_XMMS	BIT_ULL(2)
+#define PEBS_DATACFG_LBRS	BIT_ULL(3)
+#define PEBS_DATACFG_LBR_SHIFT	24
+
 /*
 * Intel "Architectural Performance Monitoring" CPUID
 * detection/enumeration details:
@ -176,6 +184,41 @@ struct x86_pmu_capability {
 #define GLOBAL_STATUS_LBRS_FROZEN			BIT_ULL(58)
 #define GLOBAL_STATUS_TRACE_TOPAPMI			BIT_ULL(55)

+/*
+ * Adaptive PEBS v4
+ */
+
+struct pebs_basic {
+	u64 format_size;
+	u64 ip;
+	u64 applicable_counters;
+	u64 tsc;
+};
+
+struct pebs_meminfo {
+	u64 address;
+	u64 aux;
+	u64 latency;
+	u64 tsx_tuning;
+};
+
+struct pebs_gprs {
+	u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
+	u64 r8, r9, r10, r11, r12, r13, r14, r15;
+};
+
+struct pebs_xmm {
+	u64 xmm[16*2];	/* two entries for each register */
+};
+
+struct pebs_lbr_entry {
+	u64 from, to, info;
+};
+
+struct pebs_lbr {
+	struct pebs_lbr_entry lbr[0]; /* Variable length */
+};
+
 /*
 * IBS cpuid feature detection
 */
@ -248,6 +291,11 @@ extern void perf_events_lapic_init(void);
 #define PERF_EFLAGS_VM		(1UL << 5)

 struct pt_regs;
+struct x86_perf_regs {
+	struct pt_regs	regs;
+	u64		*xmm_regs;
+};
+
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
@ -260,14 +308,9 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 */
 #define perf_arch_fetch_caller_regs(regs, __ip)		{	\
 	(regs)->ip = (__ip);					\
-	(regs)->bp = caller_frame_pointer();			\
+	(regs)->sp = (unsigned long)__builtin_frame_address(0);	\
 	(regs)->cs = __KERNEL_CS;				\
 	regs->flags = 0;					\
-	asm volatile(						\
-		_ASM_MOV "%%"_ASM_SP ", %0\n"			\
-		: "=m" ((regs)->sp)				\
-		:: "memory"					\
-	);							\
 }

 struct perf_guest_switch_msr {
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@ -98,19 +98,6 @@ struct stack_frame_ia32 {
    u32 return_address;
 };

-static inline unsigned long caller_frame_pointer(void)
-{
-	struct stack_frame *frame;
-
-	frame = __builtin_frame_address(0);
-
-#ifdef CONFIG_FRAME_POINTER
-	frame = frame->next_frame;
-#endif
-
-	return (unsigned long)frame;
-}
-
 void show_opcodes(struct pt_regs *regs, const char *loglvl);
 void show_ip(struct pt_regs *regs, const char *loglvl);
 #endif /* _ASM_X86_STACKTRACE_H */
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@ -27,8 +27,29 @@ enum perf_event_x86_regs {
 	PERF_REG_X86_R13,
 	PERF_REG_X86_R14,
 	PERF_REG_X86_R15,
-
+	/* These are the limits for the GPRs. */
 	PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
 	PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+
+	/* These all need two bits set because they are 128bit */
+	PERF_REG_X86_XMM0  = 32,
+	PERF_REG_X86_XMM1  = 34,
+	PERF_REG_X86_XMM2  = 36,
+	PERF_REG_X86_XMM3  = 38,
+	PERF_REG_X86_XMM4  = 40,
+	PERF_REG_X86_XMM5  = 42,
+	PERF_REG_X86_XMM6  = 44,
+	PERF_REG_X86_XMM7  = 46,
+	PERF_REG_X86_XMM8  = 48,
+	PERF_REG_X86_XMM9  = 50,
+	PERF_REG_X86_XMM10 = 52,
+	PERF_REG_X86_XMM11 = 54,
+	PERF_REG_X86_XMM12 = 56,
+	PERF_REG_X86_XMM13 = 58,
+	PERF_REG_X86_XMM14 = 60,
+	PERF_REG_X86_XMM15 = 62,
+
+	/* These include both GPRs and XMMX registers */
+	PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
 };
 #endif /* _ASM_X86_PERF_REGS_H */
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {

 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
+	struct x86_perf_regs *perf_regs;
+
+	if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
+		perf_regs = container_of(regs, struct x86_perf_regs, regs);
+		if (!perf_regs->xmm_regs)
+			return 0;
+		return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0];
+	}
+
 	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
 		return 0;

 	return regs_get_register(regs, pt_regs_offset[idx]);
 }

-#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
-
 #ifdef CONFIG_X86_32
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \
+		       (1ULL << PERF_REG_X86_R9) | \
+		       (1ULL << PERF_REG_X86_R10) | \
+		       (1ULL << PERF_REG_X86_R11) | \
+		       (1ULL << PERF_REG_X86_R12) | \
+		       (1ULL << PERF_REG_X86_R13) | \
+		       (1ULL << PERF_REG_X86_R14) | \
+		       (1ULL << PERF_REG_X86_R15))
+
 int perf_reg_validate(u64 mask)
 {
-	if (!mask || mask & REG_RESERVED)
+	if (!mask || (mask & REG_NOSUPPORT))
 		return -EINVAL;

 	return 0;
@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,

 int perf_reg_validate(u64 mask)
 {
-	if (!mask || mask & REG_RESERVED)
-		return -EINVAL;
-
-	if (mask & REG_NOSUPPORT)
+	if (!mask || (mask & REG_NOSUPPORT))
 		return -EINVAL;

 	return 0;
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@ -463,7 +463,7 @@ enum perf_addr_filter_action_t {
 /**
 * struct perf_addr_filter - address range filter definition
 * @entry:	event's filter list linkage
- * @inode:	object file's inode for file-based filters
+ * @path:	object file's path for file-based filters
 * @offset:	filter range offset
 * @size:	filter range size (size==0 means single address trigger)
 * @action:	filter/start/stop
@ -887,6 +887,9 @@ extern void perf_sched_cb_dec(struct pmu *pmu);
 extern void perf_sched_cb_inc(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
+
+extern void perf_pmu_resched(struct pmu *pmu);
+
 extern int perf_event_refresh(struct perf_event *event, int refresh);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
@ -1054,12 +1057,18 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
 #endif

 /*
- * Take a snapshot of the regs. Skip ip and frame pointer to
- * the nth caller. We only need a few of the regs:
+ * When generating a perf sample in-line, instead of from an interrupt /
+ * exception, we lack a pt_regs. This is typically used from software events
+ * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
+ *
+ * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
- * - bp for callchains
- * - eflags, for future purposes, just in case
+ * - sp for PERF_SAMPLE_CALLCHAIN
+ * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
+ *
+ * NOTE: assumes @regs is otherwise already 0 filled; this is important for
+ * things like PERF_SAMPLE_REGS_INTR.
 */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@ -2478,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }

+void perf_pmu_resched(struct pmu *pmu)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+
+	perf_ctx_lock(cpuctx, task_ctx);
+	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 /*
 * Cross CPU call to install and enable a performance event
 *
@ -11917,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)
 	}
 }

-void perf_swevent_init_cpu(unsigned int cpu)
+static void perf_swevent_init_cpu(unsigned int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@ -590,7 +590,7 @@ static void lockup_detector_reconfigure(void)
 * Create the watchdog thread infrastructure and configure the detector(s).
 *
 * The threads are not unparked as watchdog_allowed_mask is empty.  When
- * the threads are sucessfully initialized, take the proper locks and
+ * the threads are successfully initialized, take the proper locks and
 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
 */
 static __init void lockup_detector_setup(void)
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@ -67,6 +67,7 @@ FEATURE_TESTS_BASIC :=                  \
        sdt				\
        setns				\
        libaio				\
+        libzstd				\
        disassembler-four-args

 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
@ -120,6 +121,7 @@ FEATURE_DISPLAY ?=              \
         get_cpuid              \
         bpf			\
         libaio			\
+         libzstd		\
         disassembler-four-args

 # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@ -62,7 +62,8 @@ FILES=                                          \
         test-clang.bin				\
         test-llvm.bin				\
         test-llvm-version.bin			\
-         test-libaio.bin
+         test-libaio.bin			\
+         test-libzstd.bin

 FILES := $(addprefix $(OUTPUT),$(FILES))

@ -301,6 +302,9 @@ $(OUTPUT)test-clang.bin:
 $(OUTPUT)test-libaio.bin:
 	$(BUILD) -lrt

+$(OUTPUT)test-libzstd.bin:
+	$(BUILD) -lzstd
+
 ###############################

 clean:
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@ -182,6 +182,10 @@
 # include "test-disassembler-four-args.c"
 #undef main

+#define main main_test_zstd
+# include "test-libzstd.c"
+#undef main
+
 int main(int argc, char *argv[])
 {
 	main_test_libpython();
@ -224,6 +228,7 @@ int main(int argc, char *argv[])
 	main_test_libaio();
 	main_test_reallocarray();
 	main_test_disassembler_four_args();
+	main_test_libzstd();

 	return 0;
 }
--- a/tools/build/feature/test-libzstd.c
+++ b/tools/build/feature/test-libzstd.c
@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <zstd.h>
+
+int main(void)
+{
+	ZSTD_CStream	*cstream;
+
+	cstream = ZSTD_createCStream();
+	ZSTD_freeCStream(cstream);
+
+	return 0;
+}
--- a/tools/lib/traceevent/event-parse-api.c
+++ b/tools/lib/traceevent/event-parse-api.c
@ -8,6 +8,22 @@
 #include "event-parse-local.h"
 #include "event-utils.h"

+/**
+ * tep_get_event - returns the event with the given index
+ * @tep: a handle to the tep_handle
+ * @index: index of the requested event, in the range 0 .. nr_events
+ *
+ * This returns pointer to the element of the events array with the given index
+ * If @tep is NULL, or @index is not in the range 0 .. nr_events, NULL is returned.
+ */
+struct tep_event *tep_get_event(struct tep_handle *tep, int index)
+{
+	if (tep && tep->events && index < tep->nr_events)
+		return tep->events[index];
+
+	return NULL;
+}
+
 /**
 * tep_get_first_event - returns the first event in the events array
 * @tep: a handle to the tep_handle
@ -17,10 +33,7 @@
 */
 struct tep_event *tep_get_first_event(struct tep_handle *tep)
 {
-	if (tep && tep->events)
-		return tep->events[0];
-
-	return NULL;
+	return tep_get_event(tep, 0);
 }

 /**
@ -32,7 +45,7 @@ struct tep_event *tep_get_first_event(struct tep_handle *tep)
 */
 int tep_get_events_count(struct tep_handle *tep)
 {
-	if(tep)
+	if (tep)
 		return tep->nr_events;
 	return 0;
 }
@ -43,19 +56,47 @@ int tep_get_events_count(struct tep_handle *tep)
 * @flag: flag, or combination of flags to be set
 * can be any combination from enum tep_flag
 *
- * This sets a flag or mbination of flags  from enum tep_flag
-  */
+ * This sets a flag or combination of flags from enum tep_flag
+ */
 void tep_set_flag(struct tep_handle *tep, int flag)
 {
-	if(tep)
+	if (tep)
 		tep->flags |= flag;
 }

-unsigned short tep_data2host2(struct tep_handle *pevent, unsigned short data)
+/**
+ * tep_clear_flag - clear event parser flag
+ * @tep: a handle to the tep_handle
+ * @flag: flag to be cleared
+ *
+ * This clears a tep flag
+ */
+void tep_clear_flag(struct tep_handle *tep, enum tep_flag flag)
+{
+	if (tep)
+		tep->flags &= ~flag;
+}
+
+/**
+ * tep_test_flag - check the state of event parser flag
+ * @tep: a handle to the tep_handle
+ * @flag: flag to be checked
+ *
+ * This returns the state of the requested tep flag.
+ * Returns: true if the flag is set, false otherwise.
+ */
+bool tep_test_flag(struct tep_handle *tep, enum tep_flag flag)
+{
+	if (tep)
+		return tep->flags & flag;
+	return false;
+}
+
+unsigned short tep_data2host2(struct tep_handle *tep, unsigned short data)
 {
 	unsigned short swap;

-	if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+	if (!tep || tep->host_bigendian == tep->file_bigendian)
 		return data;

 	swap = ((data & 0xffULL) << 8) |
@ -64,11 +105,11 @@ unsigned short tep_data2host2(struct tep_handle *pevent, unsigned short data)
 	return swap;
 }

-unsigned int tep_data2host4(struct tep_handle *pevent, unsigned int data)
+unsigned int tep_data2host4(struct tep_handle *tep, unsigned int data)
 {
 	unsigned int swap;

-	if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+	if (!tep || tep->host_bigendian == tep->file_bigendian)
 		return data;

 	swap = ((data & 0xffULL) << 24) |
@ -80,11 +121,11 @@ unsigned int tep_data2host4(struct tep_handle *pevent, unsigned int data)
 }

 unsigned long long
-tep_data2host8(struct tep_handle *pevent, unsigned long long data)
+tep_data2host8(struct tep_handle *tep, unsigned long long data)
 {
 	unsigned long long swap;

-	if (!pevent || pevent->host_bigendian == pevent->file_bigendian)
+	if (!tep || tep->host_bigendian == tep->file_bigendian)
 		return data;

 	swap = ((data & 0xffULL) << 56) |
@ -101,175 +142,232 @@ tep_data2host8(struct tep_handle *pevent, unsigned long long data)

 /**
 * tep_get_header_page_size - get size of the header page
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 *
 * This returns size of the header page
- * If @pevent is NULL, 0 is returned.
+ * If @tep is NULL, 0 is returned.
 */
-int tep_get_header_page_size(struct tep_handle *pevent)
+int tep_get_header_page_size(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->header_page_size_size;
+	if (tep)
+		return tep->header_page_size_size;
+	return 0;
+}
+
+/**
+ * tep_get_header_timestamp_size - get size of the timestamp in the header page
+ * @tep: a handle to the tep_handle
+ *
+ * This returns size of the timestamp in the header page
+ * If @tep is NULL, 0 is returned.
+ */
+int tep_get_header_timestamp_size(struct tep_handle *tep)
+{
+	if (tep)
+		return tep->header_page_ts_size;
 	return 0;
 }

 /**
 * tep_get_cpus - get the number of CPUs
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 *
 * This returns the number of CPUs
- * If @pevent is NULL, 0 is returned.
+ * If @tep is NULL, 0 is returned.
 */
-int tep_get_cpus(struct tep_handle *pevent)
+int tep_get_cpus(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->cpus;
+	if (tep)
+		return tep->cpus;
 	return 0;
 }

 /**
 * tep_set_cpus - set the number of CPUs
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 *
 * This sets the number of CPUs
 */
-void tep_set_cpus(struct tep_handle *pevent, int cpus)
+void tep_set_cpus(struct tep_handle *tep, int cpus)
 {
-	if(pevent)
-		pevent->cpus = cpus;
+	if (tep)
+		tep->cpus = cpus;
 }

 /**
- * tep_get_long_size - get the size of a long integer on the current machine
- * @pevent: a handle to the tep_handle
+ * tep_get_long_size - get the size of a long integer on the traced machine
+ * @tep: a handle to the tep_handle
 *
- * This returns the size of a long integer on the current machine
- * If @pevent is NULL, 0 is returned.
+ * This returns the size of a long integer on the traced machine
+ * If @tep is NULL, 0 is returned.
 */
-int tep_get_long_size(struct tep_handle *pevent)
+int tep_get_long_size(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->long_size;
+	if (tep)
+		return tep->long_size;
 	return 0;
 }

 /**
- * tep_set_long_size - set the size of a long integer on the current machine
- * @pevent: a handle to the tep_handle
+ * tep_set_long_size - set the size of a long integer on the traced machine
+ * @tep: a handle to the tep_handle
 * @size: size, in bytes, of a long integer
 *
- * This sets the size of a long integer on the current machine
+ * This sets the size of a long integer on the traced machine
 */
-void tep_set_long_size(struct tep_handle *pevent, int long_size)
+void tep_set_long_size(struct tep_handle *tep, int long_size)
 {
-	if(pevent)
-		pevent->long_size = long_size;
+	if (tep)
+		tep->long_size = long_size;
 }

 /**
- * tep_get_page_size - get the size of a memory page on the current machine
- * @pevent: a handle to the tep_handle
+ * tep_get_page_size - get the size of a memory page on the traced machine
+ * @tep: a handle to the tep_handle
 *
- * This returns the size of a memory page on the current machine
- * If @pevent is NULL, 0 is returned.
+ * This returns the size of a memory page on the traced machine
+ * If @tep is NULL, 0 is returned.
 */
-int tep_get_page_size(struct tep_handle *pevent)
+int tep_get_page_size(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->page_size;
+	if (tep)
+		return tep->page_size;
 	return 0;
 }

 /**
- * tep_set_page_size - set the size of a memory page on the current machine
- * @pevent: a handle to the tep_handle
+ * tep_set_page_size - set the size of a memory page on the traced machine
+ * @tep: a handle to the tep_handle
 * @_page_size: size of a memory page, in bytes
 *
- * This sets the size of a memory page on the current machine
+ * This sets the size of a memory page on the traced machine
 */
-void tep_set_page_size(struct tep_handle *pevent, int _page_size)
+void tep_set_page_size(struct tep_handle *tep, int _page_size)
 {
-	if(pevent)
-		pevent->page_size = _page_size;
+	if (tep)
+		tep->page_size = _page_size;
 }

 /**
- * tep_file_bigendian - get if the file is in big endian order
- * @pevent: a handle to the tep_handle
+ * tep_is_file_bigendian - return the endian of the file
+ * @tep: a handle to the tep_handle
 *
- * This returns if the file is in big endian order
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the file is in big endian order
+ * If @tep is NULL, false is returned.
 */
-int tep_file_bigendian(struct tep_handle *pevent)
+bool tep_is_file_bigendian(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->file_bigendian;
-	return 0;
+	if (tep)
+		return (tep->file_bigendian == TEP_BIG_ENDIAN);
+	return false;
 }

 /**
 * tep_set_file_bigendian - set if the file is in big endian order
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 * @endian: non zero, if the file is in big endian order
 *
 * This sets if the file is in big endian order
 */
-void tep_set_file_bigendian(struct tep_handle *pevent, enum tep_endian endian)
+void tep_set_file_bigendian(struct tep_handle *tep, enum tep_endian endian)
 {
-	if(pevent)
-		pevent->file_bigendian = endian;
+	if (tep)
+		tep->file_bigendian = endian;
 }

 /**
- * tep_is_host_bigendian - get if the order of the current host is big endian
- * @pevent: a handle to the tep_handle
+ * tep_is_local_bigendian - return the endian of the saved local machine
+ * @tep: a handle to the tep_handle
 *
- * This gets if the order of the current host is big endian
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the saved local machine in @tep is big endian.
+ * If @tep is NULL, false is returned.
 */
-int tep_is_host_bigendian(struct tep_handle *pevent)
+bool tep_is_local_bigendian(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->host_bigendian;
+	if (tep)
+		return (tep->host_bigendian == TEP_BIG_ENDIAN);
 	return 0;
 }

 /**
- * tep_set_host_bigendian - set the order of the local host
- * @pevent: a handle to the tep_handle
+ * tep_set_local_bigendian - set the stored local machine endian order
+ * @tep: a handle to the tep_handle
 * @endian: non zero, if the local host has big endian order
 *
- * This sets the order of the local host
+ * This sets the endian order for the local machine.
 */
-void tep_set_host_bigendian(struct tep_handle *pevent, enum tep_endian endian)
+void tep_set_local_bigendian(struct tep_handle *tep, enum tep_endian endian)
 {
-	if(pevent)
-		pevent->host_bigendian = endian;
+	if (tep)
+		tep->host_bigendian = endian;
 }

 /**
 * tep_is_latency_format - get if the latency output format is configured
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 *
- * This gets if the latency output format is configured
- * If @pevent is NULL, 0 is returned.
+ * This returns true if the latency output format is configured
+ * If @tep is NULL, false is returned.
 */
-int tep_is_latency_format(struct tep_handle *pevent)
+bool tep_is_latency_format(struct tep_handle *tep)
 {
-	if(pevent)
-		return pevent->latency_format;
-	return 0;
+	if (tep)
+		return (tep->latency_format);
+	return false;
 }

 /**
 * tep_set_latency_format - set the latency output format
- * @pevent: a handle to the tep_handle
+ * @tep: a handle to the tep_handle
 * @lat: non zero for latency output format
 *
 * This sets the latency output format
  */
-void tep_set_latency_format(struct tep_handle *pevent, int lat)
+void tep_set_latency_format(struct tep_handle *tep, int lat)
 {
-	if(pevent)
-		pevent->latency_format = lat;
+	if (tep)
+		tep->latency_format = lat;
+}
+
+/**
+ * tep_is_old_format - get if an old kernel is used
+ * @tep: a handle to the tep_handle
+ *
+ * This returns true, if an old kernel is used to generate the tracing events or
+ * false if a new kernel is used. Old kernels did not have header page info.
+ * If @tep is NULL, false is returned.
+ */
+bool tep_is_old_format(struct tep_handle *tep)
+{
+	if (tep)
+		return tep->old_format;
+	return false;
+}
+
+/**
+ * tep_set_print_raw - set a flag to force print in raw format
+ * @tep: a handle to the tep_handle
+ * @print_raw: the new value of the print_raw flag
+ *
+ * This sets a flag to force print in raw format
+ */
+void tep_set_print_raw(struct tep_handle *tep, int print_raw)
+{
+	if (tep)
+		tep->print_raw = print_raw;
+}
+
+/**
+ * tep_set_test_filters - set a flag to test a filter string
+ * @tep: a handle to the tep_handle
+ * @test_filters: the new value of the test_filters flag
+ *
+ * This sets a flag to test a filter string. If this flag is set, when
+ * tep_filter_add_filter_str() API as called,it will print the filter string
+ * instead of adding it.
+ */
+void tep_set_test_filters(struct tep_handle *tep, int test_filters)
+{
+	if (tep)
+		tep->test_filters = test_filters;
 }
--- a/tools/lib/traceevent/event-parse-local.h
+++ b/tools/lib/traceevent/event-parse-local.h
@ -92,8 +92,8 @@ struct tep_handle {
 void tep_free_event(struct tep_event *event);
 void tep_free_format_field(struct tep_format_field *field);

-unsigned short tep_data2host2(struct tep_handle *pevent, unsigned short data);
-unsigned int tep_data2host4(struct tep_handle *pevent, unsigned int data);
-unsigned long long tep_data2host8(struct tep_handle *pevent, unsigned long long data);
+unsigned short tep_data2host2(struct tep_handle *tep, unsigned short data);
+unsigned int tep_data2host4(struct tep_handle *tep, unsigned int data);
+unsigned long long tep_data2host8(struct tep_handle *tep, unsigned long long data);

 #endif /* _PARSE_EVENTS_INT_H */
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@ -64,8 +64,8 @@ typedef int (*tep_event_handler_func)(struct trace_seq *s,
 				      struct tep_event *event,
 				      void *context);

-typedef int (*tep_plugin_load_func)(struct tep_handle *pevent);
-typedef int (*tep_plugin_unload_func)(struct tep_handle *pevent);
+typedef int (*tep_plugin_load_func)(struct tep_handle *tep);
+typedef int (*tep_plugin_unload_func)(struct tep_handle *tep);

 struct tep_plugin_option {
 	struct tep_plugin_option	*next;
@ -85,12 +85,12 @@ struct tep_plugin_option {
 * TEP_PLUGIN_LOADER:  (required)
 *   The function name to initialized the plugin.
 *
- *   int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+ *   int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 *
 * TEP_PLUGIN_UNLOADER:  (optional)
 *   The function called just before unloading
 *
- *   int TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+ *   int TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 *
 * TEP_PLUGIN_OPTIONS:  (optional)
 *   Plugin options that can be set before loading
@ -278,7 +278,7 @@ struct tep_print_fmt {
 };

 struct tep_event {
-	struct tep_handle	*pevent;
+	struct tep_handle	*tep;
 	char			*name;
 	int			id;
 	int			flags;
@ -393,9 +393,9 @@ struct tep_plugin_list;

 #define INVALID_PLUGIN_LIST_OPTION	((char **)((unsigned long)-1))

-struct tep_plugin_list *tep_load_plugins(struct tep_handle *pevent);
+struct tep_plugin_list *tep_load_plugins(struct tep_handle *tep);
 void tep_unload_plugins(struct tep_plugin_list *plugin_list,
-			struct tep_handle *pevent);
+			struct tep_handle *tep);
 char **tep_plugin_list_options(void);
 void tep_plugin_free_options_list(char **list);
 int tep_plugin_add_options(const char *name,
@ -409,8 +409,10 @@ void tep_print_plugins(struct trace_seq *s,
 typedef char *(tep_func_resolver_t)(void *priv,
 				    unsigned long long *addrp, char **modp);
 void tep_set_flag(struct tep_handle *tep, int flag);
+void tep_clear_flag(struct tep_handle *tep, enum tep_flag flag);
+bool tep_test_flag(struct tep_handle *tep, enum tep_flag flags);

-static inline int tep_host_bigendian(void)
+static inline int tep_is_bigendian(void)
 {
 	unsigned char str[] = { 0x1, 0x2, 0x3, 0x4 };
 	unsigned int val;
@ -428,37 +430,37 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x10,
 };

-int tep_set_function_resolver(struct tep_handle *pevent,
+int tep_set_function_resolver(struct tep_handle *tep,
 			      tep_func_resolver_t *func, void *priv);
-void tep_reset_function_resolver(struct tep_handle *pevent);
-int tep_register_comm(struct tep_handle *pevent, const char *comm, int pid);
-int tep_override_comm(struct tep_handle *pevent, const char *comm, int pid);
-int tep_register_trace_clock(struct tep_handle *pevent, const char *trace_clock);
-int tep_register_function(struct tep_handle *pevent, char *name,
+void tep_reset_function_resolver(struct tep_handle *tep);
+int tep_register_comm(struct tep_handle *tep, const char *comm, int pid);
+int tep_override_comm(struct tep_handle *tep, const char *comm, int pid);
+int tep_register_trace_clock(struct tep_handle *tep, const char *trace_clock);
+int tep_register_function(struct tep_handle *tep, char *name,
 			  unsigned long long addr, char *mod);
-int tep_register_print_string(struct tep_handle *pevent, const char *fmt,
+int tep_register_print_string(struct tep_handle *tep, const char *fmt,
 			      unsigned long long addr);
-int tep_pid_is_registered(struct tep_handle *pevent, int pid);
+bool tep_is_pid_registered(struct tep_handle *tep, int pid);

-void tep_print_event_task(struct tep_handle *pevent, struct trace_seq *s,
+void tep_print_event_task(struct tep_handle *tep, struct trace_seq *s,
 			  struct tep_event *event,
 			  struct tep_record *record);
-void tep_print_event_time(struct tep_handle *pevent, struct trace_seq *s,
+void tep_print_event_time(struct tep_handle *tep, struct trace_seq *s,
 			  struct tep_event *event,
 			  struct tep_record *record,
 			  bool use_trace_clock);
-void tep_print_event_data(struct tep_handle *pevent, struct trace_seq *s,
+void tep_print_event_data(struct tep_handle *tep, struct trace_seq *s,
 			  struct tep_event *event,
 			  struct tep_record *record);
-void tep_print_event(struct tep_handle *pevent, struct trace_seq *s,
+void tep_print_event(struct tep_handle *tep, struct trace_seq *s,
 		     struct tep_record *record, bool use_trace_clock);

-int tep_parse_header_page(struct tep_handle *pevent, char *buf, unsigned long size,
+int tep_parse_header_page(struct tep_handle *tep, char *buf, unsigned long size,
 			  int long_size);

-enum tep_errno tep_parse_event(struct tep_handle *pevent, const char *buf,
+enum tep_errno tep_parse_event(struct tep_handle *tep, const char *buf,
 			       unsigned long size, const char *sys);
-enum tep_errno tep_parse_format(struct tep_handle *pevent,
+enum tep_errno tep_parse_format(struct tep_handle *tep,
 				struct tep_event **eventp,
 				const char *buf,
 				unsigned long size, const char *sys);
@ -490,50 +492,50 @@ enum tep_reg_handler {
 	TEP_REGISTER_SUCCESS_OVERWRITE,
 };

-int tep_register_event_handler(struct tep_handle *pevent, int id,
+int tep_register_event_handler(struct tep_handle *tep, int id,
 			       const char *sys_name, const char *event_name,
 			       tep_event_handler_func func, void *context);
-int tep_unregister_event_handler(struct tep_handle *pevent, int id,
+int tep_unregister_event_handler(struct tep_handle *tep, int id,
 				 const char *sys_name, const char *event_name,
 				 tep_event_handler_func func, void *context);
-int tep_register_print_function(struct tep_handle *pevent,
+int tep_register_print_function(struct tep_handle *tep,
 				tep_func_handler func,
 				enum tep_func_arg_type ret_type,
 				char *name, ...);
-int tep_unregister_print_function(struct tep_handle *pevent,
+int tep_unregister_print_function(struct tep_handle *tep,
 				  tep_func_handler func, char *name);

 struct tep_format_field *tep_find_common_field(struct tep_event *event, const char *name);
 struct tep_format_field *tep_find_field(struct tep_event *event, const char *name);
 struct tep_format_field *tep_find_any_field(struct tep_event *event, const char *name);

-const char *tep_find_function(struct tep_handle *pevent, unsigned long long addr);
+const char *tep_find_function(struct tep_handle *tep, unsigned long long addr);
 unsigned long long
-tep_find_function_address(struct tep_handle *pevent, unsigned long long addr);
-unsigned long long tep_read_number(struct tep_handle *pevent, const void *ptr, int size);
+tep_find_function_address(struct tep_handle *tep, unsigned long long addr);
+unsigned long long tep_read_number(struct tep_handle *tep, const void *ptr, int size);
 int tep_read_number_field(struct tep_format_field *field, const void *data,
 			  unsigned long long *value);

 struct tep_event *tep_get_first_event(struct tep_handle *tep);
 int tep_get_events_count(struct tep_handle *tep);
-struct tep_event *tep_find_event(struct tep_handle *pevent, int id);
+struct tep_event *tep_find_event(struct tep_handle *tep, int id);

 struct tep_event *
-tep_find_event_by_name(struct tep_handle *pevent, const char *sys, const char *name);
+tep_find_event_by_name(struct tep_handle *tep, const char *sys, const char *name);
 struct tep_event *
-tep_find_event_by_record(struct tep_handle *pevent, struct tep_record *record);
+tep_find_event_by_record(struct tep_handle *tep, struct tep_record *record);

-void tep_data_lat_fmt(struct tep_handle *pevent,
-		      struct trace_seq *s, struct tep_record *record);
-int tep_data_type(struct tep_handle *pevent, struct tep_record *rec);
-int tep_data_pid(struct tep_handle *pevent, struct tep_record *rec);
-int tep_data_preempt_count(struct tep_handle *pevent, struct tep_record *rec);
-int tep_data_flags(struct tep_handle *pevent, struct tep_record *rec);
-const char *tep_data_comm_from_pid(struct tep_handle *pevent, int pid);
+void tep_data_latency_format(struct tep_handle *tep,
+			     struct trace_seq *s, struct tep_record *record);
+int tep_data_type(struct tep_handle *tep, struct tep_record *rec);
+int tep_data_pid(struct tep_handle *tep, struct tep_record *rec);
+int tep_data_preempt_count(struct tep_handle *tep, struct tep_record *rec);
+int tep_data_flags(struct tep_handle *tep, struct tep_record *rec);
+const char *tep_data_comm_from_pid(struct tep_handle *tep, int pid);
 struct tep_cmdline;
-struct tep_cmdline *tep_data_pid_from_comm(struct tep_handle *pevent, const char *comm,
+struct tep_cmdline *tep_data_pid_from_comm(struct tep_handle *tep, const char *comm,
 					   struct tep_cmdline *next);
-int tep_cmdline_pid(struct tep_handle *pevent, struct tep_cmdline *cmdline);
+int tep_cmdline_pid(struct tep_handle *tep, struct tep_cmdline *cmdline);

 void tep_print_field(struct trace_seq *s, void *data,
 		     struct tep_format_field *field);
@ -541,10 +543,12 @@ void tep_print_fields(struct trace_seq *s, void *data,
 		      int size __maybe_unused, struct tep_event *event);
 void tep_event_info(struct trace_seq *s, struct tep_event *event,
 		    struct tep_record *record);
-int tep_strerror(struct tep_handle *pevent, enum tep_errno errnum,
+int tep_strerror(struct tep_handle *tep, enum tep_errno errnum,
 		 char *buf, size_t buflen);

-struct tep_event **tep_list_events(struct tep_handle *pevent, enum tep_event_sort_type);
+struct tep_event **tep_list_events(struct tep_handle *tep, enum tep_event_sort_type);
+struct tep_event **tep_list_events_copy(struct tep_handle *tep,
+					enum tep_event_sort_type);
 struct tep_format_field **tep_event_common_fields(struct tep_event *event);
 struct tep_format_field **tep_event_fields(struct tep_event *event);

@ -552,24 +556,28 @@ enum tep_endian {
        TEP_LITTLE_ENDIAN = 0,
        TEP_BIG_ENDIAN
 };
-int tep_get_cpus(struct tep_handle *pevent);
-void tep_set_cpus(struct tep_handle *pevent, int cpus);
-int tep_get_long_size(struct tep_handle *pevent);
-void tep_set_long_size(struct tep_handle *pevent, int long_size);
-int tep_get_page_size(struct tep_handle *pevent);
-void tep_set_page_size(struct tep_handle *pevent, int _page_size);
-int tep_file_bigendian(struct tep_handle *pevent);
-void tep_set_file_bigendian(struct tep_handle *pevent, enum tep_endian endian);
-int tep_is_host_bigendian(struct tep_handle *pevent);
-void tep_set_host_bigendian(struct tep_handle *pevent, enum tep_endian endian);
-int tep_is_latency_format(struct tep_handle *pevent);
-void tep_set_latency_format(struct tep_handle *pevent, int lat);
-int tep_get_header_page_size(struct tep_handle *pevent);
+int tep_get_cpus(struct tep_handle *tep);
+void tep_set_cpus(struct tep_handle *tep, int cpus);
+int tep_get_long_size(struct tep_handle *tep);
+void tep_set_long_size(struct tep_handle *tep, int long_size);
+int tep_get_page_size(struct tep_handle *tep);
+void tep_set_page_size(struct tep_handle *tep, int _page_size);
+bool tep_is_file_bigendian(struct tep_handle *tep);
+void tep_set_file_bigendian(struct tep_handle *tep, enum tep_endian endian);
+bool tep_is_local_bigendian(struct tep_handle *tep);
+void tep_set_local_bigendian(struct tep_handle *tep, enum tep_endian endian);
+bool tep_is_latency_format(struct tep_handle *tep);
+void tep_set_latency_format(struct tep_handle *tep, int lat);
+int tep_get_header_page_size(struct tep_handle *tep);
+int tep_get_header_timestamp_size(struct tep_handle *tep);
+bool tep_is_old_format(struct tep_handle *tep);
+void tep_set_print_raw(struct tep_handle *tep, int print_raw);
+void tep_set_test_filters(struct tep_handle *tep, int test_filters);

 struct tep_handle *tep_alloc(void);
-void tep_free(struct tep_handle *pevent);
-void tep_ref(struct tep_handle *pevent);
-void tep_unref(struct tep_handle *pevent);
+void tep_free(struct tep_handle *tep);
+void tep_ref(struct tep_handle *tep);
+void tep_unref(struct tep_handle *tep);
 int tep_get_ref(struct tep_handle *tep);

 /* access to the internal parser */
@ -581,8 +589,8 @@ const char *tep_get_input_buf(void);
 unsigned long long tep_get_input_buf_ptr(void);

 /* for debugging */
-void tep_print_funcs(struct tep_handle *pevent);
-void tep_print_printk(struct tep_handle *pevent);
+void tep_print_funcs(struct tep_handle *tep);
+void tep_print_printk(struct tep_handle *tep);

 /* ----------------------- filtering ----------------------- */

@ -709,13 +717,13 @@ struct tep_filter_type {
 #define TEP_FILTER_ERROR_BUFSZ  1024

 struct tep_event_filter {
-	struct tep_handle	*pevent;
+	struct tep_handle	*tep;
 	int			filters;
 	struct tep_filter_type	*event_filters;
 	char			error_buffer[TEP_FILTER_ERROR_BUFSZ];
 };

-struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent);
+struct tep_event_filter *tep_filter_alloc(struct tep_handle *tep);

 /* for backward compatibility */
 #define FILTER_NONE		TEP_ERRNO__NO_FILTER
@ -723,12 +731,6 @@ struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent);
 #define FILTER_MISS		TEP_ERRNO__FILTER_MISS
 #define FILTER_MATCH		TEP_ERRNO__FILTER_MATCH

-enum tep_filter_trivial_type {
-	TEP_FILTER_TRIVIAL_FALSE,
-	TEP_FILTER_TRIVIAL_TRUE,
-	TEP_FILTER_TRIVIAL_BOTH,
-};
-
 enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 					 const char *filter_str);

@ -743,9 +745,6 @@ int tep_event_filtered(struct tep_event_filter *filter,

 void tep_filter_reset(struct tep_event_filter *filter);

-int tep_filter_clear_trivial(struct tep_event_filter *filter,
-			     enum tep_filter_trivial_type type);
-
 void tep_filter_free(struct tep_event_filter *filter);

 char *tep_filter_make_string(struct tep_event_filter *filter, int event_id);
@ -753,15 +752,8 @@ char *tep_filter_make_string(struct tep_event_filter *filter, int event_id);
 int tep_filter_remove_event(struct tep_event_filter *filter,
 			    int event_id);

-int tep_filter_event_has_trivial(struct tep_event_filter *filter,
-				 int event_id,
-				 enum tep_filter_trivial_type type);
-
 int tep_filter_copy(struct tep_event_filter *dest, struct tep_event_filter *source);

-int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter *source,
-			enum tep_filter_trivial_type type);
-
 int tep_filter_compare(struct tep_event_filter *filter1, struct tep_event_filter *filter2);

 #endif /* _PARSE_EVENTS_H */
--- a/tools/lib/traceevent/event-plugin.c
+++ b/tools/lib/traceevent/event-plugin.c
@ -269,7 +269,7 @@ void tep_print_plugins(struct trace_seq *s,
 }

 static void
-load_plugin(struct tep_handle *pevent, const char *path,
+load_plugin(struct tep_handle *tep, const char *path,
 	    const char *file, void *data)
 {
 	struct tep_plugin_list **plugin_list = data;
@ -316,7 +316,7 @@ load_plugin(struct tep_handle *pevent, const char *path,
 	*plugin_list = list;

 	pr_stat("registering plugin: %s", plugin);
-	func(pevent);
+	func(tep);
 	return;

 out_free:
@ -324,9 +324,9 @@ load_plugin(struct tep_handle *pevent, const char *path,
 }

 static void
-load_plugins_dir(struct tep_handle *pevent, const char *suffix,
+load_plugins_dir(struct tep_handle *tep, const char *suffix,
 		 const char *path,
-		 void (*load_plugin)(struct tep_handle *pevent,
+		 void (*load_plugin)(struct tep_handle *tep,
 				     const char *path,
 				     const char *name,
 				     void *data),
@ -359,15 +359,15 @@ load_plugins_dir(struct tep_handle *pevent, const char *suffix,
 		if (strcmp(name + (strlen(name) - strlen(suffix)), suffix) != 0)
 			continue;

-		load_plugin(pevent, path, name, data);
+		load_plugin(tep, path, name, data);
 	}

 	closedir(dir);
 }

 static void
-load_plugins(struct tep_handle *pevent, const char *suffix,
-	     void (*load_plugin)(struct tep_handle *pevent,
+load_plugins(struct tep_handle *tep, const char *suffix,
+	     void (*load_plugin)(struct tep_handle *tep,
 				 const char *path,
 				 const char *name,
 				 void *data),
@ -378,7 +378,7 @@ load_plugins(struct tep_handle *pevent, const char *suffix,
 	char *envdir;
 	int ret;

-	if (pevent->flags & TEP_DISABLE_PLUGINS)
+	if (tep->flags & TEP_DISABLE_PLUGINS)
 		return;

 	/*
@ -386,8 +386,8 @@ load_plugins(struct tep_handle *pevent, const char *suffix,
 	 * check that first.
 	 */
 #ifdef PLUGIN_DIR
-	if (!(pevent->flags & TEP_DISABLE_SYS_PLUGINS))
-		load_plugins_dir(pevent, suffix, PLUGIN_DIR,
+	if (!(tep->flags & TEP_DISABLE_SYS_PLUGINS))
+		load_plugins_dir(tep, suffix, PLUGIN_DIR,
 				 load_plugin, data);
 #endif

@ -397,7 +397,7 @@ load_plugins(struct tep_handle *pevent, const char *suffix,
 	 */
 	envdir = getenv("TRACEEVENT_PLUGIN_DIR");
 	if (envdir)
-		load_plugins_dir(pevent, suffix, envdir, load_plugin, data);
+		load_plugins_dir(tep, suffix, envdir, load_plugin, data);

 	/*
 	 * Now let the home directory override the environment
@ -413,22 +413,22 @@ load_plugins(struct tep_handle *pevent, const char *suffix,
 		return;
 	}

-	load_plugins_dir(pevent, suffix, path, load_plugin, data);
+	load_plugins_dir(tep, suffix, path, load_plugin, data);

 	free(path);
 }

 struct tep_plugin_list*
-tep_load_plugins(struct tep_handle *pevent)
+tep_load_plugins(struct tep_handle *tep)
 {
 	struct tep_plugin_list *list = NULL;

-	load_plugins(pevent, ".so", load_plugin, &list);
+	load_plugins(tep, ".so", load_plugin, &list);
 	return list;
 }

 void
-tep_unload_plugins(struct tep_plugin_list *plugin_list, struct tep_handle *pevent)
+tep_unload_plugins(struct tep_plugin_list *plugin_list, struct tep_handle *tep)
 {
 	tep_plugin_unload_func func;
 	struct tep_plugin_list *list;
@ -438,7 +438,7 @@ tep_unload_plugins(struct tep_plugin_list *plugin_list, struct tep_handle *peven
 		plugin_list = list->next;
 		func = dlsym(list->handle, TEP_PLUGIN_UNLOADER_NAME);
 		if (func)
-			func(pevent);
+			func(tep);
 		dlclose(list->handle);
 		free(list->name);
 		free(list);
--- a/tools/lib/traceevent/kbuffer-parse.c
+++ b/tools/lib/traceevent/kbuffer-parse.c
@ -727,3 +727,52 @@ int kbuffer_start_of_data(struct kbuffer *kbuf)
 {
 	return kbuf->start;
 }
+
+/**
+ * kbuffer_raw_get - get raw buffer info
+ * @kbuf:	The kbuffer
+ * @subbuf:	Start of mapped subbuffer
+ * @info:	Info descriptor to fill in
+ *
+ * For debugging. This can return internals of the ring buffer.
+ * Expects to have info->next set to what it will read.
+ * The type, length and timestamp delta will be filled in, and
+ * @info->next will be updated to the next element.
+ * The @subbuf is used to know if the info is passed the end of
+ * data and NULL will be returned if it is.
+ */
+struct kbuffer_raw_info *
+kbuffer_raw_get(struct kbuffer *kbuf, void *subbuf, struct kbuffer_raw_info *info)
+{
+	unsigned long long flags;
+	unsigned long long delta;
+	unsigned int type_len;
+	unsigned int size;
+	int start;
+	int length;
+	void *ptr = info->next;
+
+	if (!kbuf || !subbuf)
+		return NULL;
+
+	if (kbuf->flags & KBUFFER_FL_LONG_8)
+		start = 16;
+	else
+		start = 12;
+
+	flags = read_long(kbuf, subbuf + 8);
+	size = (unsigned int)flags & COMMIT_MASK;
+
+	if (ptr < subbuf || ptr >= subbuf + start + size)
+		return NULL;
+
+	type_len = translate_data(kbuf, ptr, &ptr, &delta, &length);
+
+	info->next = ptr + length;
+
+	info->type = type_len;
+	info->delta = delta;
+	info->length = length;
+
+	return info;
+}
--- a/tools/lib/traceevent/kbuffer.h
+++ b/tools/lib/traceevent/kbuffer.h
@ -65,4 +65,17 @@ int kbuffer_subbuffer_size(struct kbuffer *kbuf);
 void kbuffer_set_old_format(struct kbuffer *kbuf);
 int kbuffer_start_of_data(struct kbuffer *kbuf);

+/* Debugging */
+
+struct kbuffer_raw_info {
+	int			type;
+	int			length;
+	unsigned long long	delta;
+	void			*next;
+};
+
+/* Read raw data */
+struct kbuffer_raw_info *kbuffer_raw_get(struct kbuffer *kbuf, void *subbuf,
+					 struct kbuffer_raw_info *info);
+
 #endif /* _K_BUFFER_H */
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@ -154,7 +154,7 @@ add_filter_type(struct tep_event_filter *filter, int id)

 	filter_type = &filter->event_filters[i];
 	filter_type->event_id = id;
-	filter_type->event = tep_find_event(filter->pevent, id);
+	filter_type->event = tep_find_event(filter->tep, id);
 	filter_type->filter = NULL;

 	filter->filters++;
@ -164,9 +164,9 @@ add_filter_type(struct tep_event_filter *filter, int id)

 /**
 * tep_filter_alloc - create a new event filter
- * @pevent: The pevent that this filter is associated with
+ * @tep: The tep that this filter is associated with
 */
-struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent)
+struct tep_event_filter *tep_filter_alloc(struct tep_handle *tep)
 {
 	struct tep_event_filter *filter;

@ -175,8 +175,8 @@ struct tep_event_filter *tep_filter_alloc(struct tep_handle *pevent)
 		return NULL;

 	memset(filter, 0, sizeof(*filter));
-	filter->pevent = pevent;
-	tep_ref(pevent);
+	filter->tep = tep;
+	tep_ref(tep);

 	return filter;
 }
@ -256,7 +256,7 @@ static int event_match(struct tep_event *event,
 }

 static enum tep_errno
-find_event(struct tep_handle *pevent, struct event_list **events,
+find_event(struct tep_handle *tep, struct event_list **events,
 	   char *sys_name, char *event_name)
 {
 	struct tep_event *event;
@ -299,8 +299,8 @@ find_event(struct tep_handle *pevent, struct event_list **events,
 		}
 	}

-	for (i = 0; i < pevent->nr_events; i++) {
-		event = pevent->events[i];
+	for (i = 0; i < tep->nr_events; i++) {
+		event = tep->events[i];
 		if (event_match(event, sys_name ? &sreg : NULL, &ereg)) {
 			match = 1;
 			if (add_event(events, event) < 0) {
@ -1257,7 +1257,7 @@ static void filter_init_error_buf(struct tep_event_filter *filter)
 enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 					 const char *filter_str)
 {
-	struct tep_handle *pevent = filter->pevent;
+	struct tep_handle *tep = filter->tep;
 	struct event_list *event;
 	struct event_list *events = NULL;
 	const char *filter_start;
@ -1313,7 +1313,7 @@ enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 		}

 		/* Find this event */
-		ret = find_event(pevent, &events, strim(sys_name), strim(event_name));
+		ret = find_event(tep, &events, strim(sys_name), strim(event_name));
 		if (ret < 0) {
 			free_events(events);
 			free(this_event);
@ -1334,7 +1334,7 @@ enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,
 		if (ret < 0)
 			rtn = ret;

-		if (ret >= 0 && pevent->test_filters) {
+		if (ret >= 0 && tep->test_filters) {
 			char *test;
 			test = tep_filter_make_string(filter, event->event->id);
 			if (test) {
@ -1346,9 +1346,6 @@ enum tep_errno tep_filter_add_filter_str(struct tep_event_filter *filter,

 	free_events(events);

-	if (rtn >= 0 && pevent->test_filters)
-		exit(0);
-
 	return rtn;
 }

@ -1380,7 +1377,7 @@ int tep_filter_strerror(struct tep_event_filter *filter, enum tep_errno err,
 		return 0;
 	}

-	return tep_strerror(filter->pevent, err, buf, buflen);
+	return tep_strerror(filter->tep, err, buf, buflen);
 }

 /**
@ -1443,7 +1440,7 @@ void tep_filter_reset(struct tep_event_filter *filter)

 void tep_filter_free(struct tep_event_filter *filter)
 {
-	tep_unref(filter->pevent);
+	tep_unref(filter->tep);

 	tep_filter_reset(filter);

@ -1462,10 +1459,10 @@ static int copy_filter_type(struct tep_event_filter *filter,
 	const char *name;
 	char *str;

-	/* Can't assume that the pevent's are the same */
+	/* Can't assume that the tep's are the same */
 	sys = filter_type->event->system;
 	name = filter_type->event->name;
-	event = tep_find_event_by_name(filter->pevent, sys, name);
+	event = tep_find_event_by_name(filter->tep, sys, name);
 	if (!event)
 		return -1;

@ -1522,167 +1519,6 @@ int tep_filter_copy(struct tep_event_filter *dest, struct tep_event_filter *sour
 	return ret;
 }

-
-/**
- * tep_update_trivial - update the trivial filters with the given filter
- * @dest - the filter to update
- * @source - the filter as the source of the update
- * @type - the type of trivial filter to update.
- *
- * Scan dest for trivial events matching @type to replace with the source.
- *
- * Returns 0 on success and -1 if there was a problem updating, but
- *   events may have still been updated on error.
- */
-int tep_update_trivial(struct tep_event_filter *dest, struct tep_event_filter *source,
-		       enum tep_filter_trivial_type type)
-{
-	struct tep_handle *src_pevent;
-	struct tep_handle *dest_pevent;
-	struct tep_event *event;
-	struct tep_filter_type *filter_type;
-	struct tep_filter_arg *arg;
-	char *str;
-	int i;
-
-	src_pevent = source->pevent;
-	dest_pevent = dest->pevent;
-
-	/* Do nothing if either of the filters has nothing to filter */
-	if (!dest->filters || !source->filters)
-		return 0;
-
-	for (i = 0; i < dest->filters; i++) {
-		filter_type = &dest->event_filters[i];
-		arg = filter_type->filter;
-		if (arg->type != TEP_FILTER_ARG_BOOLEAN)
-			continue;
-		if ((arg->boolean.value && type == TEP_FILTER_TRIVIAL_FALSE) ||
-		    (!arg->boolean.value && type == TEP_FILTER_TRIVIAL_TRUE))
-			continue;
-
-		event = filter_type->event;
-
-		if (src_pevent != dest_pevent) {
-			/* do a look up */
-			event = tep_find_event_by_name(src_pevent,
-						       event->system,
-						       event->name);
-			if (!event)
-				return -1;
-		}
-
-		str = tep_filter_make_string(source, event->id);
-		if (!str)
-			continue;
-
-		/* Don't bother if the filter is trivial too */
-		if (strcmp(str, "TRUE") != 0 && strcmp(str, "FALSE") != 0)
-			filter_event(dest, event, str, NULL);
-		free(str);
-	}
-	return 0;
-}
-
-/**
- * tep_filter_clear_trivial - clear TRUE and FALSE filters
- * @filter: the filter to remove trivial filters from
- * @type: remove only true, false, or both
- *
- * Removes filters that only contain a TRUE or FALES boolean arg.
- *
- * Returns 0 on success and -1 if there was a problem.
- */
-int tep_filter_clear_trivial(struct tep_event_filter *filter,
-			     enum tep_filter_trivial_type type)
-{
-	struct tep_filter_type *filter_type;
-	int count = 0;
-	int *ids = NULL;
-	int i;
-
-	if (!filter->filters)
-		return 0;
-
-	/*
-	 * Two steps, first get all ids with trivial filters.
-	 *  then remove those ids.
-	 */
-	for (i = 0; i < filter->filters; i++) {
-		int *new_ids;
-
-		filter_type = &filter->event_filters[i];
-		if (filter_type->filter->type != TEP_FILTER_ARG_BOOLEAN)
-			continue;
-		switch (type) {
-		case TEP_FILTER_TRIVIAL_FALSE:
-			if (filter_type->filter->boolean.value)
-				continue;
-			break;
-		case TEP_FILTER_TRIVIAL_TRUE:
-			if (!filter_type->filter->boolean.value)
-				continue;
-		default:
-			break;
-		}
-
-		new_ids = realloc(ids, sizeof(*ids) * (count + 1));
-		if (!new_ids) {
-			free(ids);
-			return -1;
-		}
-
-		ids = new_ids;
-		ids[count++] = filter_type->event_id;
-	}
-
-	if (!count)
-		return 0;
-
-	for (i = 0; i < count; i++)
-		tep_filter_remove_event(filter, ids[i]);
-
-	free(ids);
-	return 0;
-}
-
-/**
- * tep_filter_event_has_trivial - return true event contains trivial filter
- * @filter: the filter with the information
- * @event_id: the id of the event to test
- * @type: trivial type to test for (TRUE, FALSE, EITHER)
- *
- * Returns 1 if the event contains a matching trivial type
- *  otherwise 0.
- */
-int tep_filter_event_has_trivial(struct tep_event_filter *filter,
-				 int event_id,
-				 enum tep_filter_trivial_type type)
-{
-	struct tep_filter_type *filter_type;
-
-	if (!filter->filters)
-		return 0;
-
-	filter_type = find_filter_type(filter, event_id);
-
-	if (!filter_type)
-		return 0;
-
-	if (filter_type->filter->type != TEP_FILTER_ARG_BOOLEAN)
-		return 0;
-
-	switch (type) {
-	case TEP_FILTER_TRIVIAL_FALSE:
-		return !filter_type->filter->boolean.value;
-
-	case TEP_FILTER_TRIVIAL_TRUE:
-		return filter_type->filter->boolean.value;
-	default:
-		return 1;
-	}
-}
-
 static int test_filter(struct tep_event *event, struct tep_filter_arg *arg,
 		       struct tep_record *record, enum tep_errno *err);

@ -1692,8 +1528,8 @@ get_comm(struct tep_event *event, struct tep_record *record)
 	const char *comm;
 	int pid;

-	pid = tep_data_pid(event->pevent, record);
-	comm = tep_data_comm_from_pid(event->pevent, pid);
+	pid = tep_data_pid(event->tep, record);
+	comm = tep_data_comm_from_pid(event->tep, pid);
 	return comm;
 }

@ -1861,7 +1697,7 @@ static int test_num(struct tep_event *event, struct tep_filter_arg *arg,
 static const char *get_field_str(struct tep_filter_arg *arg, struct tep_record *record)
 {
 	struct tep_event *event;
-	struct tep_handle *pevent;
+	struct tep_handle *tep;
 	unsigned long long addr;
 	const char *val = NULL;
 	unsigned int size;
@ -1891,12 +1727,12 @@ static const char *get_field_str(struct tep_filter_arg *arg, struct tep_record *

 	} else {
 		event = arg->str.field->event;
-		pevent = event->pevent;
+		tep = event->tep;
 		addr = get_value(event, arg->str.field, record);

 		if (arg->str.field->flags & (TEP_FIELD_IS_POINTER | TEP_FIELD_IS_LONG))
 			/* convert to a kernel symbol */
-			val = tep_find_function(pevent, addr);
+			val = tep_find_function(tep, addr);

 		if (val == NULL) {
 			/* just use the hex of the string name */
@ -2036,7 +1872,7 @@ int tep_event_filtered(struct tep_event_filter *filter, int event_id)
 enum tep_errno tep_filter_match(struct tep_event_filter *filter,
 				struct tep_record *record)
 {
-	struct tep_handle *pevent = filter->pevent;
+	struct tep_handle *tep = filter->tep;
 	struct tep_filter_type *filter_type;
 	int event_id;
 	int ret;
@ -2047,7 +1883,7 @@ enum tep_errno tep_filter_match(struct tep_event_filter *filter,
 	if (!filter->filters)
 		return TEP_ERRNO__NO_FILTER;

-	event_id = tep_data_type(pevent, record);
+	event_id = tep_data_type(tep, record);

 	filter_type = find_filter_type(filter, event_id);
 	if (!filter_type)
@ -2409,14 +2245,6 @@ int tep_filter_compare(struct tep_event_filter *filter1, struct tep_event_filter
 			break;
 		if (filter_type1->filter->type != filter_type2->filter->type)
 			break;
-		switch (filter_type1->filter->type) {
-		case TEP_FILTER_TRIVIAL_FALSE:
-		case TEP_FILTER_TRIVIAL_TRUE:
-			/* trivial types just need the type compared */
-			continue;
-		default:
-			break;
-		}
 		/* The best way to compare complex filters is with strings */
 		str1 = arg_to_str(filter1, filter_type1->filter);
 		str2 = arg_to_str(filter2, filter_type2->filter);
--- a/tools/lib/traceevent/plugin_cfg80211.c
+++ b/tools/lib/traceevent/plugin_cfg80211.c
@ -25,9 +25,9 @@ process___le16_to_cpup(struct trace_seq *s, unsigned long long *args)
 	return val ? (long long) le16toh(*val) : 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process___le16_to_cpup,
 				    TEP_FUNC_ARG_INT,
 				    "__le16_to_cpup",
@ -36,8 +36,8 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_print_function(pevent, process___le16_to_cpup,
+	tep_unregister_print_function(tep, process___le16_to_cpup,
 				      "__le16_to_cpup");
 }
--- a/tools/lib/traceevent/plugin_function.c
+++ b/tools/lib/traceevent/plugin_function.c
@ -126,7 +126,7 @@ static int add_and_get_index(const char *parent, const char *child, int cpu)
 static int function_handler(struct trace_seq *s, struct tep_record *record,
 			    struct tep_event *event, void *context)
 {
-	struct tep_handle *pevent = event->pevent;
+	struct tep_handle *tep = event->tep;
 	unsigned long long function;
 	unsigned long long pfunction;
 	const char *func;
@ -136,12 +136,12 @@ static int function_handler(struct trace_seq *s, struct tep_record *record,
 	if (tep_get_field_val(s, event, "ip", record, &function, 1))
 		return trace_seq_putc(s, '!');

-	func = tep_find_function(pevent, function);
+	func = tep_find_function(tep, function);

 	if (tep_get_field_val(s, event, "parent_ip", record, &pfunction, 1))
 		return trace_seq_putc(s, '!');

-	parent = tep_find_function(pevent, pfunction);
+	parent = tep_find_function(tep, pfunction);

 	if (parent && ftrace_indent->set)
 		index = add_and_get_index(parent, func, record->cpu);
@ -164,9 +164,9 @@ static int function_handler(struct trace_seq *s, struct tep_record *record,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_event_handler(pevent, -1, "ftrace", "function",
+	tep_register_event_handler(tep, -1, "ftrace", "function",
 				   function_handler, NULL);

 	tep_plugin_add_options("ftrace", plugin_options);
@ -174,11 +174,11 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
 	int i, x;

-	tep_unregister_event_handler(pevent, -1, "ftrace", "function",
+	tep_unregister_event_handler(tep, -1, "ftrace", "function",
 				     function_handler, NULL);

 	for (i = 0; i <= cpus; i++) {
--- a/tools/lib/traceevent/plugin_hrtimer.c
+++ b/tools/lib/traceevent/plugin_hrtimer.c
@ -67,23 +67,23 @@ static int timer_start_handler(struct trace_seq *s,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_event_handler(pevent, -1,
+	tep_register_event_handler(tep, -1,
 				   "timer", "hrtimer_expire_entry",
 				   timer_expire_handler, NULL);

-	tep_register_event_handler(pevent, -1, "timer", "hrtimer_start",
+	tep_register_event_handler(tep, -1, "timer", "hrtimer_start",
 				   timer_start_handler, NULL);
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_event_handler(pevent, -1,
+	tep_unregister_event_handler(tep, -1,
 				     "timer", "hrtimer_expire_entry",
 				     timer_expire_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "timer", "hrtimer_start",
+	tep_unregister_event_handler(tep, -1, "timer", "hrtimer_start",
 				     timer_start_handler, NULL);
 }
--- a/tools/lib/traceevent/plugin_jbd2.c
+++ b/tools/lib/traceevent/plugin_jbd2.c
@ -48,16 +48,16 @@ process_jiffies_to_msecs(struct trace_seq *s, unsigned long long *args)
 	return jiffies;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process_jbd2_dev_to_name,
 				    TEP_FUNC_ARG_STRING,
 				    "jbd2_dev_to_name",
 				    TEP_FUNC_ARG_INT,
 				    TEP_FUNC_ARG_VOID);

-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process_jiffies_to_msecs,
 				    TEP_FUNC_ARG_LONG,
 				    "jiffies_to_msecs",
@ -66,11 +66,11 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_print_function(pevent, process_jbd2_dev_to_name,
+	tep_unregister_print_function(tep, process_jbd2_dev_to_name,
 				      "jbd2_dev_to_name");

-	tep_unregister_print_function(pevent, process_jiffies_to_msecs,
+	tep_unregister_print_function(tep, process_jiffies_to_msecs,
 				      "jiffies_to_msecs");
 }
--- a/tools/lib/traceevent/plugin_kmem.c
+++ b/tools/lib/traceevent/plugin_kmem.c
@ -39,57 +39,57 @@ static int call_site_handler(struct trace_seq *s, struct tep_record *record,
 	if (tep_read_number_field(field, data, &val))
 		return 1;

-	func = tep_find_function(event->pevent, val);
+	func = tep_find_function(event->tep, val);
 	if (!func)
 		return 1;

-	addr = tep_find_function_address(event->pevent, val);
+	addr = tep_find_function_address(event->tep, val);

 	trace_seq_printf(s, "(%s+0x%x) ", func, (int)(val - addr));
 	return 1;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_event_handler(pevent, -1, "kmem", "kfree",
+	tep_register_event_handler(tep, -1, "kmem", "kfree",
 				   call_site_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kmem", "kmalloc",
+	tep_register_event_handler(tep, -1, "kmem", "kmalloc",
 				   call_site_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kmem", "kmalloc_node",
+	tep_register_event_handler(tep, -1, "kmem", "kmalloc_node",
 				   call_site_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kmem", "kmem_cache_alloc",
+	tep_register_event_handler(tep, -1, "kmem", "kmem_cache_alloc",
 				   call_site_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kmem",
+	tep_register_event_handler(tep, -1, "kmem",
 				   "kmem_cache_alloc_node",
 				   call_site_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kmem", "kmem_cache_free",
+	tep_register_event_handler(tep, -1, "kmem", "kmem_cache_free",
 				   call_site_handler, NULL);
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_event_handler(pevent, -1, "kmem", "kfree",
+	tep_unregister_event_handler(tep, -1, "kmem", "kfree",
 				     call_site_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kmem", "kmalloc",
+	tep_unregister_event_handler(tep, -1, "kmem", "kmalloc",
 				     call_site_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kmem", "kmalloc_node",
+	tep_unregister_event_handler(tep, -1, "kmem", "kmalloc_node",
 				     call_site_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kmem", "kmem_cache_alloc",
+	tep_unregister_event_handler(tep, -1, "kmem", "kmem_cache_alloc",
 				     call_site_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kmem",
+	tep_unregister_event_handler(tep, -1, "kmem",
 				     "kmem_cache_alloc_node",
 				     call_site_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kmem", "kmem_cache_free",
+	tep_unregister_event_handler(tep, -1, "kmem", "kmem_cache_free",
 				     call_site_handler, NULL);
 }
--- a/tools/lib/traceevent/plugin_kvm.c
+++ b/tools/lib/traceevent/plugin_kvm.c
@ -389,8 +389,8 @@ static int kvm_mmu_print_role(struct trace_seq *s, struct tep_record *record,
 	 * We can only use the structure if file is of the same
 	 * endianness.
 	 */
-	if (tep_file_bigendian(event->pevent) ==
-	    tep_is_host_bigendian(event->pevent)) {
+	if (tep_is_file_bigendian(event->tep) ==
+	    tep_is_local_bigendian(event->tep)) {

 		trace_seq_printf(s, "%u q%u%s %s%s %spae %snxe %swp%s%s%s",
 				 role.level,
@ -445,40 +445,40 @@ process_is_writable_pte(struct trace_seq *s, unsigned long long *args)
 	return pte & PT_WRITABLE_MASK;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
 	init_disassembler();

-	tep_register_event_handler(pevent, -1, "kvm", "kvm_exit",
+	tep_register_event_handler(tep, -1, "kvm", "kvm_exit",
 				   kvm_exit_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kvm", "kvm_emulate_insn",
+	tep_register_event_handler(tep, -1, "kvm", "kvm_emulate_insn",
 				   kvm_emulate_insn_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kvm", "kvm_nested_vmexit",
+	tep_register_event_handler(tep, -1, "kvm", "kvm_nested_vmexit",
 				   kvm_nested_vmexit_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kvm", "kvm_nested_vmexit_inject",
+	tep_register_event_handler(tep, -1, "kvm", "kvm_nested_vmexit_inject",
 				   kvm_nested_vmexit_inject_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_get_page",
+	tep_register_event_handler(tep, -1, "kvmmmu", "kvm_mmu_get_page",
 				   kvm_mmu_get_page_handler, NULL);

-	tep_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_sync_page",
+	tep_register_event_handler(tep, -1, "kvmmmu", "kvm_mmu_sync_page",
 				   kvm_mmu_print_role, NULL);

-	tep_register_event_handler(pevent, -1,
+	tep_register_event_handler(tep, -1,
 				   "kvmmmu", "kvm_mmu_unsync_page",
 				   kvm_mmu_print_role, NULL);

-	tep_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_zap_page",
+	tep_register_event_handler(tep, -1, "kvmmmu", "kvm_mmu_zap_page",
 				   kvm_mmu_print_role, NULL);

-	tep_register_event_handler(pevent, -1, "kvmmmu",
+	tep_register_event_handler(tep, -1, "kvmmmu",
 			"kvm_mmu_prepare_zap_page", kvm_mmu_print_role,
 			NULL);

-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process_is_writable_pte,
 				    TEP_FUNC_ARG_INT,
 				    "is_writable_pte",
@ -487,37 +487,37 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_event_handler(pevent, -1, "kvm", "kvm_exit",
+	tep_unregister_event_handler(tep, -1, "kvm", "kvm_exit",
 				     kvm_exit_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvm", "kvm_emulate_insn",
+	tep_unregister_event_handler(tep, -1, "kvm", "kvm_emulate_insn",
 				     kvm_emulate_insn_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvm", "kvm_nested_vmexit",
+	tep_unregister_event_handler(tep, -1, "kvm", "kvm_nested_vmexit",
 				     kvm_nested_vmexit_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvm", "kvm_nested_vmexit_inject",
+	tep_unregister_event_handler(tep, -1, "kvm", "kvm_nested_vmexit_inject",
 				     kvm_nested_vmexit_inject_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_get_page",
+	tep_unregister_event_handler(tep, -1, "kvmmmu", "kvm_mmu_get_page",
 				     kvm_mmu_get_page_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_sync_page",
+	tep_unregister_event_handler(tep, -1, "kvmmmu", "kvm_mmu_sync_page",
 				     kvm_mmu_print_role, NULL);

-	tep_unregister_event_handler(pevent, -1,
+	tep_unregister_event_handler(tep, -1,
 				     "kvmmmu", "kvm_mmu_unsync_page",
 				     kvm_mmu_print_role, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_zap_page",
+	tep_unregister_event_handler(tep, -1, "kvmmmu", "kvm_mmu_zap_page",
 				     kvm_mmu_print_role, NULL);

-	tep_unregister_event_handler(pevent, -1, "kvmmmu",
+	tep_unregister_event_handler(tep, -1, "kvmmmu",
 			"kvm_mmu_prepare_zap_page", kvm_mmu_print_role,
 			NULL);

-	tep_unregister_print_function(pevent, process_is_writable_pte,
+	tep_unregister_print_function(tep, process_is_writable_pte,
 				      "is_writable_pte");
 }
--- a/tools/lib/traceevent/plugin_mac80211.c
+++ b/tools/lib/traceevent/plugin_mac80211.c
@ -87,17 +87,17 @@ static int drv_bss_info_changed(struct trace_seq *s,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_event_handler(pevent, -1, "mac80211",
+	tep_register_event_handler(tep, -1, "mac80211",
 				   "drv_bss_info_changed",
 				   drv_bss_info_changed, NULL);
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_event_handler(pevent, -1, "mac80211",
+	tep_unregister_event_handler(tep, -1, "mac80211",
 				     "drv_bss_info_changed",
 				     drv_bss_info_changed, NULL);
 }
--- a/tools/lib/traceevent/plugin_sched_switch.c
+++ b/tools/lib/traceevent/plugin_sched_switch.c
@ -62,7 +62,7 @@ static void write_and_save_comm(struct tep_format_field *field,
 	comm = &s->buffer[len];

 	/* Help out the comm to ids. This will handle dups */
-	tep_register_comm(field->event->pevent, comm, pid);
+	tep_register_comm(field->event->tep, comm, pid);
 }

 static int sched_wakeup_handler(struct trace_seq *s,
@ -135,27 +135,27 @@ static int sched_switch_handler(struct trace_seq *s,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_event_handler(pevent, -1, "sched", "sched_switch",
+	tep_register_event_handler(tep, -1, "sched", "sched_switch",
 				   sched_switch_handler, NULL);

-	tep_register_event_handler(pevent, -1, "sched", "sched_wakeup",
+	tep_register_event_handler(tep, -1, "sched", "sched_wakeup",
 				   sched_wakeup_handler, NULL);

-	tep_register_event_handler(pevent, -1, "sched", "sched_wakeup_new",
+	tep_register_event_handler(tep, -1, "sched", "sched_wakeup_new",
 				   sched_wakeup_handler, NULL);
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_event_handler(pevent, -1, "sched", "sched_switch",
+	tep_unregister_event_handler(tep, -1, "sched", "sched_switch",
 				     sched_switch_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "sched", "sched_wakeup",
+	tep_unregister_event_handler(tep, -1, "sched", "sched_wakeup",
 				     sched_wakeup_handler, NULL);

-	tep_unregister_event_handler(pevent, -1, "sched", "sched_wakeup_new",
+	tep_unregister_event_handler(tep, -1, "sched", "sched_wakeup_new",
 				     sched_wakeup_handler, NULL);
 }
--- a/tools/lib/traceevent/plugin_scsi.c
+++ b/tools/lib/traceevent/plugin_scsi.c
@ -414,9 +414,9 @@ unsigned long long process_scsi_trace_parse_cdb(struct trace_seq *s,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process_scsi_trace_parse_cdb,
 				    TEP_FUNC_ARG_STRING,
 				    "scsi_trace_parse_cdb",
@ -427,8 +427,8 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_print_function(pevent, process_scsi_trace_parse_cdb,
+	tep_unregister_print_function(tep, process_scsi_trace_parse_cdb,
 				      "scsi_trace_parse_cdb");
 }
--- a/tools/lib/traceevent/plugin_xen.c
+++ b/tools/lib/traceevent/plugin_xen.c
@ -120,9 +120,9 @@ unsigned long long process_xen_hypercall_name(struct trace_seq *s,
 	return 0;
 }

-int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
+int TEP_PLUGIN_LOADER(struct tep_handle *tep)
 {
-	tep_register_print_function(pevent,
+	tep_register_print_function(tep,
 				    process_xen_hypercall_name,
 				    TEP_FUNC_ARG_STRING,
 				    "xen_hypercall_name",
@ -131,8 +131,8 @@ int TEP_PLUGIN_LOADER(struct tep_handle *pevent)
 	return 0;
 }

-void TEP_PLUGIN_UNLOADER(struct tep_handle *pevent)
+void TEP_PLUGIN_UNLOADER(struct tep_handle *tep)
 {
-	tep_unregister_print_function(pevent, process_xen_hypercall_name,
+	tep_unregister_print_function(tep, process_xen_hypercall_name,
 				      "xen_hypercall_name");
 }
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@ -459,6 +459,25 @@ Set affinity mask of trace reading thread according to the policy defined by 'mo
  node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer
  cpu  - thread affinity mask is set to cpu of the processed mmap buffer

+--mmap-flush=number::
+
+Specify minimal number of bytes that is extracted from mmap data pages and
+processed for output. One can specify the number using B/K/M/G suffixes.
+
+The maximal allowed value is a quarter of the size of mmaped data pages.
+
+The default option value is 1 byte which means that every time that the output
+writing thread finds some new data in the mmaped buffer the data is extracted,
+possibly compressed (-z) and written to the output, perf.data or pipe.
+
+Larger data chunks are compressed more effectively in comparison to smaller
+chunks so extraction of larger chunks from the mmap data pages is preferable
+from the perspective of output size reduction.
+
+Also at some cases executing less output write syscalls with bigger data size
+can take less time than executing more output write syscalls with smaller data
+size thus lowering runtime profiling overhead.
+
 --all-kernel::
 Configure all used events to run in kernel space.

--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@ -152,6 +152,13 @@ endif
 FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
 FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf

+ifdef LIBZSTD_DIR
+  LIBZSTD_CFLAGS  := -I$(LIBZSTD_DIR)/lib
+  LIBZSTD_LDFLAGS := -L$(LIBZSTD_DIR)/lib
+endif
+FEATURE_CHECK_CFLAGS-libzstd := $(LIBZSTD_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libzstd := $(LIBZSTD_LDFLAGS)
+
 FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi
 # include ARCH specific config
 -include $(src-perf)/arch/$(SRCARCH)/Makefile
@ -787,6 +794,19 @@ ifndef NO_LZMA
  endif
 endif

+ifndef NO_LIBZSTD
+  ifeq ($(feature-libzstd), 1)
+    CFLAGS += -DHAVE_ZSTD_SUPPORT
+    CFLAGS += $(LIBZSTD_CFLAGS)
+    LDFLAGS += $(LIBZSTD_LDFLAGS)
+    EXTLIBS += -lzstd
+    $(call detected,CONFIG_ZSTD)
+  else
+    msg := $(warning No libzstd found, disables trace compression, please install libzstd-dev[el] and/or set LIBZSTD_DIR);
+    NO_LIBZSTD := 1
+  endif
+endif
+
 ifndef NO_BACKTRACE
  ifeq ($(feature-backtrace), 1)
    CFLAGS += -DHAVE_BACKTRACE_SUPPORT
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@ -108,6 +108,9 @@ include ../scripts/utilities.mak
 # streaming for record mode. Currently Posix AIO trace streaming is
 # supported only when linking with glibc.
 #
+# Define NO_LIBZSTD if you do not want support of Zstandard based runtime
+# trace compression in record mode.
+#

 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@ -1975,7 +1975,7 @@ int cmd_kmem(int argc, const char **argv)
 			goto out_delete;
 		}

-		kmem_page_size = tep_get_page_size(evsel->tp_format->pevent);
+		kmem_page_size = tep_get_page_size(evsel->tp_format->tep);
 		symbol_conf.use_callchain = true;
 	}

--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@ -70,10 +70,11 @@ int cmd_list(int argc, const char **argv)
 			print_symbol_events(NULL, PERF_TYPE_HARDWARE,
 					event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump);
 		else if (strcmp(argv[i], "sw") == 0 ||
-			 strcmp(argv[i], "software") == 0)
+			 strcmp(argv[i], "software") == 0) {
 			print_symbol_events(NULL, PERF_TYPE_SOFTWARE,
 					event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
-		else if (strcmp(argv[i], "cache") == 0 ||
+			print_tool_events(NULL, raw_dump);
+		} else if (strcmp(argv[i], "cache") == 0 ||
 			 strcmp(argv[i], "hwcache") == 0)
 			print_hwcache_events(NULL, raw_dump);
 		else if (strcmp(argv[i], "pmu") == 0)
@ -113,6 +114,7 @@ int cmd_list(int argc, const char **argv)
 					    event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump);
 			print_symbol_events(s, PERF_TYPE_SOFTWARE,
 					    event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
+			print_tool_events(s, raw_dump);
 			print_hwcache_events(s, raw_dump);
 			print_pmu_events(s, raw_dump, !desc_flag,
 						long_desc_flag,
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@ -337,6 +337,41 @@ static int record__aio_enabled(struct record *rec)
 	return rec->opts.nr_cblocks > 0;
 }

+#define MMAP_FLUSH_DEFAULT 1
+static int record__mmap_flush_parse(const struct option *opt,
+				    const char *str,
+				    int unset)
+{
+	int flush_max;
+	struct record_opts *opts = (struct record_opts *)opt->value;
+	static struct parse_tag tags[] = {
+			{ .tag  = 'B', .mult = 1       },
+			{ .tag  = 'K', .mult = 1 << 10 },
+			{ .tag  = 'M', .mult = 1 << 20 },
+			{ .tag  = 'G', .mult = 1 << 30 },
+			{ .tag  = 0 },
+	};
+
+	if (unset)
+		return 0;
+
+	if (str) {
+		opts->mmap_flush = parse_tag_value(str, tags);
+		if (opts->mmap_flush == (int)-1)
+			opts->mmap_flush = strtol(str, NULL, 0);
+	}
+
+	if (!opts->mmap_flush)
+		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
+
+	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
+	flush_max /= 4;
+	if (opts->mmap_flush > flush_max)
+		opts->mmap_flush = flush_max;
+
+	return 0;
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
 				     union perf_event *event,
 				     struct perf_sample *sample __maybe_unused,
@ -546,7 +581,8 @@ static int record__mmap_evlist(struct record *rec,
 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
 				 opts->auxtrace_mmap_pages,
 				 opts->auxtrace_snapshot_mode,
-				 opts->nr_cblocks, opts->affinity) < 0) {
+				 opts->nr_cblocks, opts->affinity,
+				 opts->mmap_flush) < 0) {
 		if (errno == EPERM) {
 			pr_err("Permission error mapping pages.\n"
 			       "Consider increasing "
@ -736,7 +772,7 @@ static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
 }

 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
-				    bool overwrite)
+				    bool overwrite, bool synch)
 {
 	u64 bytes_written = rec->bytes_written;
 	int i;
@ -759,12 +795,19 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 		off = record__aio_get_pos(trace_fd);

 	for (i = 0; i < evlist->nr_mmaps; i++) {
+		u64 flush = 0;
 		struct perf_mmap *map = &maps[i];

 		if (map->base) {
 			record__adjust_affinity(rec, map);
+			if (synch) {
+				flush = map->flush;
+				map->flush = 1;
+			}
 			if (!record__aio_enabled(rec)) {
 				if (perf_mmap__push(map, rec, record__pushfn) != 0) {
+					if (synch)
+						map->flush = flush;
 					rc = -1;
 					goto out;
 				}
@ -777,10 +820,14 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 				idx = record__aio_sync(map, false);
 				if (perf_mmap__aio_push(map, rec, idx, record__aio_pushfn, &off) != 0) {
 					record__aio_set_pos(trace_fd, off);
+					if (synch)
+						map->flush = flush;
 					rc = -1;
 					goto out;
 				}
 			}
+			if (synch)
+				map->flush = flush;
 		}

 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
@ -806,15 +853,15 @@ out:
 	return rc;
 }

-static int record__mmap_read_all(struct record *rec)
+static int record__mmap_read_all(struct record *rec, bool synch)
 {
 	int err;

-	err = record__mmap_read_evlist(rec, rec->evlist, false);
+	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
 	if (err)
 		return err;

-	return record__mmap_read_evlist(rec, rec->evlist, true);
+	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
 }

 static void record__init_features(struct record *rec)
@ -1340,7 +1387,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

-		if (record__mmap_read_all(rec) < 0) {
+		if (record__mmap_read_all(rec, false) < 0) {
 			trigger_error(&auxtrace_snapshot_trigger);
 			trigger_error(&switch_output_trigger);
 			err = -1;
@ -1441,6 +1488,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		record__synthesize_workload(rec, true);

 out_child:
+	record__mmap_read_all(rec, true);
 	record__aio_mmap_read_sync(rec);

 	if (forks) {
@ -1846,6 +1894,7 @@ static struct record record = {
 			.uses_mmap   = true,
 			.default_per_cpu = true,
 		},
+		.mmap_flush          = MMAP_FLUSH_DEFAULT,
 	},
 	.tool = {
 		.sample		= process_sample_event,
@ -1912,6 +1961,9 @@ static struct option __record_options[] = {
 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
 		     "number of mmap data pages and AUX area tracing mmap pages",
 		     record__parse_mmap_pages),
+	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
+		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
+		     record__mmap_flush_parse),
 	OPT_BOOLEAN(0, "group", &record.opts.group,
 		    "put the counters into a counter group"),
 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
@ -2224,6 +2276,7 @@ int cmd_record(int argc, const char **argv)
 		pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);

 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
+	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

 	err = __cmd_record(&record, argc, argv);
 out:
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@ -244,11 +244,25 @@ perf_evsel__write_stat_event(struct perf_evsel *counter, u32 cpu, u32 thread,
 					   process_synthesized_event, NULL);
 }

+static int read_single_counter(struct perf_evsel *counter, int cpu,
+			       int thread, struct timespec *rs)
+{
+	if (counter->tool_event == PERF_TOOL_DURATION_TIME) {
+		u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
+		struct perf_counts_values *count =
+			perf_counts(counter->counts, cpu, thread);
+		count->ena = count->run = val;
+		count->val = val;
+		return 0;
+	}
+	return perf_evsel__read_counter(counter, cpu, thread);
+}
+
 /*
 * Read out the results of a single counter:
 * do not aggregate counts across CPUs in system-wide mode
 */
-static int read_counter(struct perf_evsel *counter)
+static int read_counter(struct perf_evsel *counter, struct timespec *rs)
 {
 	int nthreads = thread_map__nr(evsel_list->threads);
 	int ncpus, cpu, thread;
@ -275,7 +289,7 @@ static int read_counter(struct perf_evsel *counter)
 			 * (via perf_evsel__read_counter) and sets threir count->loaded.
 			 */
 			if (!count->loaded &&
-			    perf_evsel__read_counter(counter, cpu, thread)) {
+			    read_single_counter(counter, cpu, thread, rs)) {
 				counter->counts->scaled = -1;
 				perf_counts(counter->counts, cpu, thread)->ena = 0;
 				perf_counts(counter->counts, cpu, thread)->run = 0;
@ -304,13 +318,13 @@ static int read_counter(struct perf_evsel *counter)
 	return 0;
 }

-static void read_counters(void)
+static void read_counters(struct timespec *rs)
 {
 	struct perf_evsel *counter;
 	int ret;

 	evlist__for_each_entry(evsel_list, counter) {
-		ret = read_counter(counter);
+		ret = read_counter(counter, rs);
 		if (ret)
 			pr_debug("failed to read counter %s\n", counter->name);

@ -323,11 +337,11 @@ static void process_interval(void)
 {
 	struct timespec ts, rs;

-	read_counters();
-
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	diff_timespec(&rs, &ts, &ref_time);

+	read_counters(&rs);
+
 	if (STAT_RECORD) {
 		if (WRITE_STAT_ROUND_EVENT(rs.tv_sec * NSEC_PER_SEC + rs.tv_nsec, INTERVAL))
 			pr_err("failed to write stat round event\n");
@ -593,7 +607,7 @@ try_again:
 	 * avoid arbitrary skew, we must read all counters before closing any
 	 * group leaders.
 	 */
-	read_counters();
+	read_counters(&(struct timespec) { .tv_nsec = t1-t0 });
 	perf_evlist__close(evsel_list);

 	return WEXITSTATUS(status);
--- a/tools/perf/builtin-version.c
+++ b/tools/perf/builtin-version.c
@ -78,6 +78,8 @@ static void library_status(void)
 	STATUS(HAVE_LZMA_SUPPORT, lzma);
 	STATUS(HAVE_AUXTRACE_SUPPORT, get_cpuid);
 	STATUS(HAVE_LIBBPF_SUPPORT, bpf);
+	STATUS(HAVE_AIO_SUPPORT, aio);
+	STATUS(HAVE_ZSTD_SUPPORT, zstd);
 }

 int cmd_version(int argc, const char **argv)
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@ -15,6 +15,7 @@
 */

 #include <unistd.h>
+#include <linux/limits.h>
 #include <pid_filter.h>

 /* bpf-output associated map */
@ -41,32 +42,110 @@ struct syscall_exit_args {
 struct augmented_filename {
 	unsigned int	size;
 	int		reserved;
-	char		value[256];
+	char		value[PATH_MAX];
 };

-#define SYS_OPEN 2
-#define SYS_ACCESS 21
-#define SYS_OPENAT 257
+/* syscalls where the first arg is a string */
+#define SYS_OPEN                 2
+#define SYS_STAT                 4
+#define SYS_LSTAT                6
+#define SYS_ACCESS              21
+#define SYS_EXECVE              59
+#define SYS_TRUNCATE            76
+#define SYS_CHDIR               80
+#define SYS_RENAME              82
+#define SYS_MKDIR               83
+#define SYS_RMDIR               84
+#define SYS_CREAT               85
+#define SYS_LINK                86
+#define SYS_UNLINK              87
+#define SYS_SYMLINK             88
+#define SYS_READLINK            89
+#define SYS_CHMOD               90
+#define SYS_CHOWN               92
+#define SYS_LCHOWN              94
+#define SYS_MKNOD              133
+#define SYS_STATFS             137
+#define SYS_PIVOT_ROOT         155
+#define SYS_CHROOT             161
+#define SYS_ACCT               163
+#define SYS_SWAPON             167
+#define SYS_SWAPOFF            168
+#define SYS_DELETE_MODULE      176
+#define SYS_SETXATTR           188
+#define SYS_LSETXATTR          189
+#define SYS_GETXATTR           191
+#define SYS_LGETXATTR          192
+#define SYS_LISTXATTR          194
+#define SYS_LLISTXATTR         195
+#define SYS_REMOVEXATTR        197
+#define SYS_LREMOVEXATTR       198
+#define SYS_MQ_OPEN            240
+#define SYS_MQ_UNLINK          241
+#define SYS_ADD_KEY            248
+#define SYS_REQUEST_KEY        249
+#define SYS_SYMLINKAT          266
+#define SYS_MEMFD_CREATE       319
+
+/* syscalls where the first arg is a string */
+
+#define SYS_PWRITE64            18
+#define SYS_EXECVE              59
+#define SYS_RENAME              82
+#define SYS_QUOTACTL           179
+#define SYS_FSETXATTR          190
+#define SYS_FGETXATTR          193
+#define SYS_FREMOVEXATTR       199
+#define SYS_MQ_TIMEDSEND       242
+#define SYS_REQUEST_KEY        249
+#define SYS_INOTIFY_ADD_WATCH  254
+#define SYS_OPENAT             257
+#define SYS_MKDIRAT            258
+#define SYS_MKNODAT            259
+#define SYS_FCHOWNAT           260
+#define SYS_FUTIMESAT          261
+#define SYS_NEWFSTATAT         262
+#define SYS_UNLINKAT           263
+#define SYS_RENAMEAT           264
+#define SYS_LINKAT             265
+#define SYS_READLINKAT         267
+#define SYS_FCHMODAT           268
+#define SYS_FACCESSAT          269
+#define SYS_UTIMENSAT          280
+#define SYS_NAME_TO_HANDLE_AT  303
+#define SYS_FINIT_MODULE       313
+#define SYS_RENAMEAT2          316
+#define SYS_EXECVEAT           322
+#define SYS_STATX              332

 pid_filter(pids_filtered);

+struct augmented_args_filename {
+       struct syscall_enter_args args;
+       struct augmented_filename filename;
+};
+
+bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
+
 SEC("raw_syscalls:sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
-	struct {
-		struct syscall_enter_args args;
-		struct augmented_filename filename;
-	} augmented_args;
-	struct syscall *syscall;
-	unsigned int len = sizeof(augmented_args);
+	struct augmented_args_filename *augmented_args;
+	unsigned int len = sizeof(*augmented_args);
 	const void *filename_arg = NULL;
+	struct syscall *syscall;
+	int key = 0;
+
+        augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
+        if (augmented_args == NULL)
+                return 1;

 	if (pid_filter__has(&pids_filtered, getpid()))
 		return 0;

-	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
+	probe_read(&augmented_args->args, sizeof(augmented_args->args), args);

-	syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr);
+	syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
 	if (syscall == NULL || !syscall->enabled)
 		return 0;
 	/*
@ -109,30 +188,105 @@ int sys_enter(struct syscall_enter_args *args)
 	 *
 	 * 	 after the ctx memory access to prevent their down stream merging.
 	 */
-	switch (augmented_args.args.syscall_nr) {
+	/*
+	 * This table of what args are strings will be provided by userspace,
+	 * in the syscalls map, i.e. we will already have to do the lookup to
+	 * see if this specific syscall is filtered, so we can as well get more
+	 * info about what syscall args are strings or pointers, and how many
+	 * bytes to copy, per arg, etc.
+	 *
+	 * For now hard code it, till we have all the basic mechanisms in place
+	 * to automate everything and make the kernel part be completely driven
+	 * by information obtained in userspace for each kernel version and
+	 * processor architecture, making the kernel part the same no matter what
+	 * kernel version or processor architecture it runs on.
+	 */
+	switch (augmented_args->args.syscall_nr) {
+	case SYS_ACCT:
+	case SYS_ADD_KEY:
+	case SYS_CHDIR:
+	case SYS_CHMOD:
+	case SYS_CHOWN:
+	case SYS_CHROOT:
+	case SYS_CREAT:
+	case SYS_DELETE_MODULE:
+	case SYS_EXECVE:
+	case SYS_GETXATTR:
+	case SYS_LCHOWN:
+	case SYS_LGETXATTR:
+	case SYS_LINK:
+	case SYS_LISTXATTR:
+	case SYS_LLISTXATTR:
+	case SYS_LREMOVEXATTR:
+	case SYS_LSETXATTR:
+	case SYS_LSTAT:
+	case SYS_MEMFD_CREATE:
+	case SYS_MKDIR:
+	case SYS_MKNOD:
+	case SYS_MQ_OPEN:
+	case SYS_MQ_UNLINK:
+	case SYS_PIVOT_ROOT:
+	case SYS_READLINK:
+	case SYS_REMOVEXATTR:
+	case SYS_RENAME:
+	case SYS_REQUEST_KEY:
+	case SYS_RMDIR:
+	case SYS_SETXATTR:
+	case SYS_STAT:
+	case SYS_STATFS:
+	case SYS_SWAPOFF:
+	case SYS_SWAPON:
+	case SYS_SYMLINK:
+	case SYS_SYMLINKAT:
+	case SYS_TRUNCATE:
+	case SYS_UNLINK:
 	case SYS_ACCESS:
 	case SYS_OPEN:	 filename_arg = (const void *)args->args[0];
 			__asm__ __volatile__("": : :"memory");
 			 break;
+	case SYS_EXECVEAT:
+	case SYS_FACCESSAT:
+	case SYS_FCHMODAT:
+	case SYS_FCHOWNAT:
+	case SYS_FGETXATTR:
+	case SYS_FINIT_MODULE:
+	case SYS_FREMOVEXATTR:
+	case SYS_FSETXATTR:
+	case SYS_FUTIMESAT:
+	case SYS_INOTIFY_ADD_WATCH:
+	case SYS_LINKAT:
+	case SYS_MKDIRAT:
+	case SYS_MKNODAT:
+	case SYS_MQ_TIMEDSEND:
+	case SYS_NAME_TO_HANDLE_AT:
+	case SYS_NEWFSTATAT:
+	case SYS_PWRITE64:
+	case SYS_QUOTACTL:
+	case SYS_READLINKAT:
+	case SYS_RENAMEAT:
+	case SYS_RENAMEAT2:
+	case SYS_STATX:
+	case SYS_UNLINKAT:
+	case SYS_UTIMENSAT:
 	case SYS_OPENAT: filename_arg = (const void *)args->args[1];
 			 break;
 	}

 	if (filename_arg != NULL) {
-		augmented_args.filename.reserved = 0;
-		augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
-							      sizeof(augmented_args.filename.value),
+		augmented_args->filename.reserved = 0;
+		augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
+							      sizeof(augmented_args->filename.value),
 							      filename_arg);
-		if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
-			len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
-			len &= sizeof(augmented_args.filename.value) - 1;
+		if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
+			len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
+			len &= sizeof(augmented_args->filename.value) - 1;
 		}
 	} else {
-		len = sizeof(augmented_args.args);
+		len = sizeof(augmented_args->args);
 	}

 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
-	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
+	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
 }

 SEC("raw_syscalls:sys_exit")
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@ -85,6 +85,7 @@ struct record_opts {
 	u64          clockid_res_ns;
 	int	     nr_cblocks;
 	int	     affinity;
+	int	     mmap_flush;
 };

 enum perf_affinity {
--- a/tools/perf/pmu-events/arch/s390/cf_z14/extended.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z14/extended.json
@ -4,7 +4,7 @@
 		"EventCode": "128",
 		"EventName": "L1D_RO_EXCL_WRITES",
 		"BriefDescription": "L1D Read-only Exclusive Writes",
-		"PublicDescription": "Counter:128	Name:L1D_RO_EXCL_WRITES A directory write to the Level-1 Data cache where the line was originally in a Read-Only state in the cache but has been updated to be in the Exclusive state that allows stores to the cache line"
+		"PublicDescription": "L1D_RO_EXCL_WRITES A directory write to the Level-1 Data cache where the line was originally in a Read-Only state in the cache but has been updated to be in the Exclusive state that allows stores to the cache line"
 	},
 	{
 		"Unit": "CPU-M-CF",
--- a/tools/perf/pmu-events/arch/x86/bonnell/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/bonnell/frontend.json
@ -77,7 +77,7 @@
        "UMask": "0x1",
        "EventName": "UOPS.MS_CYCLES",
        "SampleAfterValue": "2000000",
-        "BriefDescription": "This event counts the cycles where 1 or more uops are issued by the micro-sequencer (MS), including microcode assists and inserted flows, and written to the IQ. ",
+        "BriefDescription": "This event counts the cycles where 1 or more uops are issued by the micro-sequencer (MS), including microcode assists and inserted flows, and written to the IQ.",
        "CounterMask": "1"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/bonnell/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/bonnell/pipeline.json
@ -189,7 +189,7 @@
        "UMask": "0x8",
        "EventName": "BR_MISSP_TYPE_RETIRED.IND_CALL",
        "SampleAfterValue": "200000",
-        "BriefDescription": "Mispredicted indirect calls, including both register and memory indirect. "
+        "BriefDescription": "Mispredicted indirect calls, including both register and memory indirect."
    },
    {
        "EventCode": "0x89",
--- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
@ -1,164 +1,352 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
-        "MetricGroup": "Frontend",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) ) * (4 * cycles) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "Branch_Misprediction_Cost"
    },
    {
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts_SMT",
+        "MetricName": "Branch_Misprediction_Cost_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
+    },
+    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / cycles",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/broadwell/cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/cache.json
--- a/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json
@ -1,24 +1,26 @@
 [
    {
-        "PublicDescription": "This event counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts the number of transitions from AVX-256 to legacy SSE when penalty is applicable.",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x8",
        "Errata": "BDM30",
        "EventName": "OTHER_ASSISTS.AVX_TO_SSE",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable.",
+        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts the number of transitions from legacy SSE to AVX-256 when penalty is applicable.",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "Errata": "BDM30",
        "EventName": "OTHER_ASSISTS.SSE_TO_AVX",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from SSE to AVX-256 when penalty applicable.",
+        "BriefDescription": "Number of transitions from legacy SSE to AVX-256 when penalty applicable (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -45,7 +47,7 @@
        "UMask": "0x3",
        "EventName": "FP_ARITH_INST_RETIRED.SCALAR",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of SSE/AVX computational scalar floating-point instructions retired. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational scalar floating-point instructions retired. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. (RSQRT for single precision?)",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -54,7 +56,7 @@
        "UMask": "0x4",
        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired.  Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired.  Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -63,7 +65,7 @@
        "UMask": "0x8",
        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -72,7 +74,7 @@
        "UMask": "0x10",
        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -81,7 +83,7 @@
        "UMask": "0x15",
        "EventName": "FP_ARITH_INST_RETIRED.DOUBLE",
        "SampleAfterValue": "2000006",
-        "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.  ?.",
+        "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -90,7 +92,7 @@
        "UMask": "0x20",
        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired.  Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired.  Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -99,7 +101,7 @@
        "UMask": "0x2a",
        "EventName": "FP_ARITH_INST_RETIRED.SINGLE",
        "SampleAfterValue": "2000005",
-        "BriefDescription": "Number of SSE/AVX computational single precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ?.",
+        "BriefDescription": "Number of SSE/AVX computational single precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -108,57 +110,62 @@
        "UMask": "0x3c",
        "EventName": "FP_ARITH_INST_RETIRED.PACKED",
        "SampleAfterValue": "2000004",
-        "BriefDescription": "Number of SSE/AVX computational packed floating-point instructions retired. Applies to SSE* and AVX*, packed, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational packed floating-point instructions retired. Applies to SSE* and AVX*, packed, double and single precision floating-point: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. (RSQRT for single-precision?)",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "This event counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts the number of x87 floating point (FP) micro-code assist (numeric overflow/underflow, inexact result) when the output value (destination register) is invalid.",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
        "EventName": "FP_ASSIST.X87_OUTPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to output value.",
+        "BriefDescription": "output - Numeric Overflow, Numeric Underflow, Inexact Result  (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts x87 floating point (FP) micro-code assist (invalid operation, denormal operand, SNaN operand) when the input value (one of the source operands to an FP instruction) is invalid.",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x4",
        "EventName": "FP_ASSIST.X87_INPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to input value.",
+        "BriefDescription": "input - Invalid Operation, Denormal Operand, SNaN Operand  (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts the number of SSE* floating point (FP) micro-code assist (numeric overflow/underflow) when the output value (destination register) is invalid. Counting covers only cases involving penalties that require micro-code assist intervention.",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x8",
        "EventName": "FP_ASSIST.SIMD_OUTPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to Output values",
+        "BriefDescription": "SSE* FP micro-code assist when output value is invalid. (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts any input SSE* FP assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts any input SSE* floating-point (FP) assist - invalid operation, denormal operand, dividing by zero, SNaN operand. Counting includes only cases involving penalties that required micro-code assist intervention.",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "EventName": "FP_ASSIST.SIMD_INPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to input values",
+        "BriefDescription": "Any input SSE* FP Assist -   (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1.",
+        "PEBS": "1",
+        "PublicDescription": "This event counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1. Uses PEBS.",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x1e",
        "EventName": "FP_ASSIST.ANY",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Cycles with any input/output SSE or FP assist",
+        "BriefDescription": "Counts any FP_ASSIST umask was incrementing   (Precise Event)",
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/broadwell/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/frontend.json
@ -211,7 +211,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding 4  x when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when:\n a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread;\n b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); \n c. Instruction Decode Queue (IDQ) delivers four uops.",
+        "PublicDescription": "This event counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding \u201c4 \u2013 x\u201d when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when:\n a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread;\n b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions); \n c. Instruction Decode Queue (IDQ) delivers four uops.",
        "EventCode": "0x9C",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
@ -274,7 +274,7 @@
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "This event counts Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. These cycles do not include uops routed through because of the switch itself, for example, when Instruction Decode Queue (IDQ) pre-allocation is unavailable, or Instruction Decode Queue (IDQ) is full. SBD-to-MITE switch true penalty cycles happen after the merge mux (MM) receives Decode Stream Buffer (DSB) Sync-indication until receiving the first MITE uop. \nMM is placed before Instruction Decode Queue (IDQ) to merge uops being fed from the MITE and Decode Stream Buffer (DSB) paths. Decode Stream Buffer (DSB) inserts the Sync-indication whenever a Decode Stream Buffer (DSB)-to-MITE switch occurs.\nPenalty: A Decode Stream Buffer (DSB) hit followed by a Decode Stream Buffer (DSB) miss can cost up to six cycles in which no uops are delivered to the IDQ. Most often, such switches from the Decode Stream Buffer (DSB) to the legacy pipeline cost 02 cycles.",
+        "PublicDescription": "This event counts Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. These cycles do not include uops routed through because of the switch itself, for example, when Instruction Decode Queue (IDQ) pre-allocation is unavailable, or Instruction Decode Queue (IDQ) is full. SBD-to-MITE switch true penalty cycles happen after the merge mux (MM) receives Decode Stream Buffer (DSB) Sync-indication until receiving the first MITE uop. \nMM is placed before Instruction Decode Queue (IDQ) to merge uops being fed from the MITE and Decode Stream Buffer (DSB) paths. Decode Stream Buffer (DSB) inserts the Sync-indication whenever a Decode Stream Buffer (DSB)-to-MITE switch occurs.\nPenalty: A Decode Stream Buffer (DSB) hit followed by a Decode Stream Buffer (DSB) miss can cost up to six cycles in which no uops are delivered to the IDQ. Most often, such switches from the Decode Stream Buffer (DSB) to the legacy pipeline cost 0\u20132 cycles.",
        "EventCode": "0xAB",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
--- a/tools/perf/pmu-events/arch/x86/broadwell/memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/memory.json
--- a/tools/perf/pmu-events/arch/x86/broadwell/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/pipeline.json
@ -1,7 +1,6 @@
 [
    {
        "PublicDescription": "This event counts the number of instructions retired from execution. For instructions that consist of multiple micro-ops, this event counts the retirement of the last micro-op of the instruction. Counting continues during hardware interrupts, traps, and inside interrupt handlers. \nNotes: INST_RETIRED.ANY is counted by a designated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. INST_RETIRED.ANY_P is counted by a programmable counter and it is an architectural performance event. \nCounting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not count as retired instructions.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -11,7 +10,6 @@
    },
    {
        "PublicDescription": "This event counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -20,7 +18,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "AnyThread": "1",
@ -31,7 +28,6 @@
    },
    {
        "PublicDescription": "This event counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. This event has a constant ratio with the CPU_CLK_UNHALTED.REF_XCLK event. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. \nNote: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'.  This event is clocked by base clock (100 Mhz) on Sandy Bridge. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'.  After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
@ -317,7 +313,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts stalls occurred due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
+        "PublicDescription": "This event counts stalls occured due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
        "EventCode": "0x87",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
@ -786,8 +782,8 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts resource-related stall cycles. Reasons for stalls can be as follows:\n - *any* u-arch structure got full (LB, SB, RS, ROB, BOB, LM, Physical Register Reclaim Table (PRRT), or Physical History Table (PHT) slots)\n - *any* u-arch structure got empty (like INT/SIMD FreeLists)\n - FPU control word (FPCW), MXCSR\nand others. This counts cycles that the pipeline backend blocked uop delivery from the front end.",
-        "EventCode": "0xA2",
+        "PublicDescription": "This event counts resource-related stall cycles.",
+        "EventCode": "0xa2",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "RESOURCE_STALLS.ANY",
@ -973,6 +969,7 @@
        "CounterHTOff": "2"
    },
    {
+        "PublicDescription": "Number of Uops delivered by the LSD.",
        "EventCode": "0xA8",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
@ -1147,7 +1144,8 @@
        "CounterHTOff": "1"
    },
    {
-        "PublicDescription": "This event counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling.",
        "EventCode": "0xC0",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
@ -1157,12 +1155,12 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
+        "PEBS": "1",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x40",
        "EventName": "OTHER_ASSISTS.ANY_WB_ASSIST",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of times any microcode assist is invoked by HW upon uop writeback.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -1178,26 +1176,28 @@
        "Data_LA": "1"
    },
    {
-        "PublicDescription": "This event counts cycles without actually retired uops.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts cycles without actually retired uops.",
        "EventCode": "0xC2",
        "Invert": "1",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "UOPS_RETIRED.STALL_CYCLES",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles without actually retired uops.",
+        "BriefDescription": "Cycles no executable uops retired (Precise Event)",
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event.",
+        "PEBS": "1",
+        "PublicDescription": "Number of cycles using always true condition (uops_ret < 16) applied to  PEBS uops retired event.",
        "EventCode": "0xC2",
        "Invert": "1",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "UOPS_RETIRED.TOTAL_CYCLES",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with less than 10 actually retired uops.",
+        "BriefDescription": "Number of cycles using always true condition applied to  PEBS uops retired event.",
        "CounterMask": "10",
        "CounterHTOff": "0,1,2,3"
    },
@ -1320,13 +1320,14 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts not taken branch instructions retired.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts not taken branch instructions retired.",
        "EventCode": "0xC4",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "EventName": "BR_INST_RETIRED.NOT_TAKEN",
        "SampleAfterValue": "400009",
-        "BriefDescription": "Not taken branch instructions retired.",
+        "BriefDescription": "Counts all not taken macro branch instructions retired. (Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -1341,14 +1342,15 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts far branch instructions retired.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts far branch instructions retired.",
        "EventCode": "0xC4",
        "Counter": "0,1,2,3",
        "UMask": "0x40",
        "Errata": "BDW98",
        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Far branch instructions retired.",
+        "BriefDescription": "Counts the number of far branch instructions retired.(Precise Event)",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
--- a/tools/perf/pmu-events/arch/x86/broadwellde/cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/cache.json
@ -439,7 +439,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-split load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -451,7 +451,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-split store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "SampleAfterValue": "100003",
        "L1_Hit_Indication": "1",
        "CounterHTOff": "0,1,2,3"
--- a/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/pipeline.json
@ -1,6 +1,5 @@
 [
    {
-        "EventCode": "0x00",
        "UMask": "0x1",
        "BriefDescription": "Instructions retired from execution.",
        "Counter": "Fixed counter 0",
@ -10,7 +9,6 @@
        "CounterHTOff": "Fixed counter 0"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when the thread is not in halt state",
        "Counter": "Fixed counter 1",
@ -20,7 +18,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
        "Counter": "Fixed counter 1",
@ -30,7 +27,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x3",
        "BriefDescription": "Reference cycles when the core is not in halt state.",
        "Counter": "Fixed counter 2",
@ -322,7 +318,7 @@
        "BriefDescription": "Stalls caused by changing prefix length of the instruction.",
        "Counter": "0,1,2,3",
        "EventName": "ILD_STALL.LCP",
-        "PublicDescription": "This event counts stalls occurred due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
+        "PublicDescription": "This event counts stalls occured due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
--- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
@ -1,164 +1,370 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
-        "MetricGroup": "Frontend",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) ) * (4 * cycles) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "Branch_Misprediction_Cost"
    },
    {
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts_SMT",
+        "MetricName": "Branch_Misprediction_Cost_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
+    },
+    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * cycles )",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles))",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) )",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
+        "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )",
+        "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_Lat",
+        "MetricName": "DRAM_Read_Latency"
+    },
+    {
+        "MetricExpr": "cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182\\,thresh\\=1@",
+        "BriefDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_Parallel_Reads"
+    },
+    {
+        "MetricExpr": "cbox_0@event\\=0x0@",
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricGroup": "",
+        "MetricName": "Socket_CLKS"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/broadwellx/cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/cache.json
@ -57,17 +57,17 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x41",
+        "UMask": "0xc1",
        "BriefDescription": "Demand Data Read requests that hit L2 cache",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
-        "PublicDescription": "This event counts the number of demand Data Read requests that hit L2 cache. Only not rejected loads are counted.",
+        "PublicDescription": "Counts the number of demand Data Read requests, initiated by load instructions, that hit L2 cache.",
        "SampleAfterValue": "200003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x42",
+        "UMask": "0xc2",
        "BriefDescription": "RFO requests that hit L2 cache.",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.RFO_HIT",
@ -76,7 +76,7 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x44",
+        "UMask": "0xc4",
        "BriefDescription": "L2 cache hits when fetching instructions, code reads.",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.CODE_RD_HIT",
@ -85,7 +85,7 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x50",
+        "UMask": "0xd0",
        "BriefDescription": "L2 prefetch requests that hit L2 cache",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.L2_PF_HIT",
@ -396,24 +396,24 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x11",
-        "BriefDescription": "Retired load uops that miss the STLB. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops that miss the STLB.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_LOADS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault.",
+        "PublicDescription": "This event counts load uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x12",
-        "BriefDescription": "Retired store uops that miss the STLB. (Precise Event - PEBS)",
+        "BriefDescription": "Retired store uops that miss the STLB.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_STORES",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts store uops true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault.",
+        "PublicDescription": "This event counts store uops with true STLB miss retired to the architected path. True STLB miss is an uop triggering page walk that gets completed without blocks, and later gets retired. This page walk can end up with or without a fault.",
        "SampleAfterValue": "100003",
        "L1_Hit_Indication": "1",
        "CounterHTOff": "0,1,2,3"
@ -421,37 +421,37 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x21",
-        "BriefDescription": "Retired load uops with locked access. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops with locked access.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
        "Errata": "BDM35",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts load uops with locked access retired to the architected path.",
+        "PublicDescription": "This event counts load uops with locked access retired to the architected path.",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x41",
-        "BriefDescription": "Retired load uops that split across a cacheline boundary.(Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops that split across a cacheline boundary.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-split load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This event counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x42",
-        "BriefDescription": "Retired store uops that split across a cacheline boundary. (Precise Event - PEBS)",
+        "BriefDescription": "Retired store uops that split across a cacheline boundary.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts line-split store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This event counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "SampleAfterValue": "100003",
        "L1_Hit_Indication": "1",
        "CounterHTOff": "0,1,2,3"
@ -459,24 +459,24 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x81",
-        "BriefDescription": "All retired load uops. (Precise Event - PEBS)",
+        "BriefDescription": "All retired load uops.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts load uops retired to the architected path with a filter on bits 0 and 1 applied.\nNote: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches.",
+        "PublicDescription": "This event counts load uops retired to the architected path with a filter on bits 0 and 1 applied.\nNote: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement. This event also counts SW prefetches.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x82",
-        "BriefDescription": "Retired store uops that split across a cacheline boundary. (Precise Event - PEBS)",
+        "BriefDescription": "All retired store uops.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts store uops retired to the architected path with a filter on bits 0 and 1 applied.\nNote: This event ?ounts AVX-256bit load/store double-pump memory uops as a single uop at retirement.",
+        "PublicDescription": "This event counts store uops retired to the architected path with a filter on bits 0 and 1 applied.\nNote: This event counts AVX-256bit load/store double-pump memory uops as a single uop at retirement.",
        "SampleAfterValue": "2000003",
        "L1_Hit_Indication": "1",
        "CounterHTOff": "0,1,2,3"
@ -484,69 +484,69 @@
    {
        "EventCode": "0xD1",
        "UMask": "0x1",
-        "BriefDescription": "Retired load uops with L1 cache hits as data sources. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops with L1 cache hits as data sources.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data source were hits in the nearest-level (L1) cache.\nNote: Only two data-sources of L1/FB are applicable for AVX-256bit  even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source.",
+        "PublicDescription": "This event counts retired load uops which data sources were hits in the nearest-level (L1) cache.\nNote: Only two data-sources of L1/FB are applicable for AVX-256bit  even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load. This event also counts SW prefetches independent of the actual data source.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x2",
-        "BriefDescription": "Retired load uops with L2 cache hits as data sources. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops with L2 cache hits as data sources.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
        "Errata": "BDM35",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were hits in the mid-level (L2) cache.",
+        "PublicDescription": "This event counts retired load uops which data sources were hits in the mid-level (L2) cache.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x4",
-        "BriefDescription": "Hit in last-level (L3) cache. Excludes Unknown data-source. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were data hits in L3 without snoops required.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
        "Errata": "BDM100",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required.",
+        "PublicDescription": "This event counts retired load uops which data sources were data hits in the last-level (L3) cache without snoops required.",
        "SampleAfterValue": "50021",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x8",
-        "BriefDescription": "Retired load uops misses in L1 cache as data sources. Uses PEBS.",
+        "BriefDescription": "Retired load uops misses in L1 cache as data sources.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source.",
+        "PublicDescription": "This event counts retired load uops which data sources were misses in the nearest-level (L1) cache. Counting excludes unknown and UC data source.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x10",
-        "BriefDescription": "Retired load uops with L2 cache misses as data sources. Uses PEBS.",
+        "BriefDescription": "Miss in mid-level (L2) cache. Excludes Unknown data-source.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source.",
+        "PublicDescription": "This event counts retired load uops which data sources were misses in the mid-level (L2) cache. Counting excludes unknown and UC data source.",
        "SampleAfterValue": "50021",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x20",
-        "BriefDescription": "Miss in last-level (L3) cache. Excludes Unknown data-source. (Precise Event - PEBS).",
+        "BriefDescription": "Miss in last-level (L3) cache. Excludes Unknown data-source.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -558,83 +558,84 @@
    {
        "EventCode": "0xD1",
        "UMask": "0x40",
-        "BriefDescription": "Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.HIT_LFB",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready.\nNote: Only two data-sources of L1/FB are applicable for AVX-256bit  even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load.",
+        "PublicDescription": "This event counts retired load uops which data sources were load uops missed L1 but hit a fill buffer due to a preceding miss to the same cache line with the data not ready.\nNote: Only two data-sources of L1/FB are applicable for AVX-256bit  even though the corresponding AVX load could be serviced by a deeper level in the memory hierarchy. Data source is reported for the Low-half load.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD2",
        "UMask": "0x1",
-        "BriefDescription": "Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were L3 hit and cross-core snoop missed in on-pkg core cache.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS",
        "Errata": "BDM100",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache.",
+        "PublicDescription": "This event counts retired load uops which data sources were L3 Hit and a cross-core snoop missed in the on-pkg core cache.",
        "SampleAfterValue": "20011",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD2",
        "UMask": "0x2",
-        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT",
        "Errata": "BDM100",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache.",
+        "PublicDescription": "This event counts retired load uops which data sources were L3 hit and a cross-core snoop hit in the on-pkg core cache.",
        "SampleAfterValue": "20011",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD2",
        "UMask": "0x4",
-        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM",
        "Errata": "BDM100",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were HitM responses from a core on same socket (shared L3).",
+        "PublicDescription": "This event counts retired load uops which data sources were HitM responses from a core on same socket (shared L3).",
        "SampleAfterValue": "20011",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD2",
        "UMask": "0x8",
-        "BriefDescription": "Retired load uops which data sources were hits in L3 without snoops required. (Precise Event - PEBS)",
+        "BriefDescription": "Retired load uops which data sources were hits in L3 without snoops required.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE",
        "Errata": "BDM100",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required.",
+        "PublicDescription": "This event counts retired load uops which data sources were hits in the last-level (L3) cache without snoops required.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD3",
        "UMask": "0x1",
+        "BriefDescription": "Data from local DRAM either Snoop not needed or Snoop Miss (RspI)",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM",
        "Errata": "BDE70, BDM100",
-        "PublicDescription": "This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. This is a precise event.",
+        "PublicDescription": "Retired load uop whose Data Source was: local DRAM either Snoop not needed or Snoop Miss (RspI).",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD3",
        "UMask": "0x4",
-        "BriefDescription": "Retired load uop whose Data Source was: remote DRAM either Snoop not needed or Snoop Miss (RspI) (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: remote DRAM either Snoop not needed or Snoop Miss (RspI)",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -646,7 +647,7 @@
    {
        "EventCode": "0xD3",
        "UMask": "0x10",
-        "BriefDescription": "Retired load uop whose Data Source was: Remote cache HITM (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: Remote cache HITM",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -658,7 +659,7 @@
    {
        "EventCode": "0xD3",
        "UMask": "0x20",
-        "BriefDescription": "Retired load uop whose Data Source was: forwarded from remote cache (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: forwarded from remote cache",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -810,12 +811,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all requests that hit in the L3",
-        "MSRValue": "0x3f803c8fff",
+        "BriefDescription": "Counts all requests hit in the L3",
+        "MSRValue": "0x3F803C8FFF",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all requests that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -823,12 +824,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c07f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C07F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -836,12 +837,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c07f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C07F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -849,12 +850,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0244",
+        "BriefDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -862,12 +863,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0122",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -875,12 +876,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0122",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -888,12 +889,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0091",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -901,12 +902,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0091",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -914,12 +915,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3",
-        "MSRValue": "0x3f803c0200",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
+        "MSRValue": "0x3F803C0200",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_CODE_RD.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -927,12 +928,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3",
-        "MSRValue": "0x3f803c0100",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
+        "MSRValue": "0x3F803C0100",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_RFO.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -940,12 +941,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0002",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -953,12 +954,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3",
-        "MSRValue": "0x3f803c0002",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3",
+        "MSRValue": "0x3F803C0002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json
@ -42,7 +42,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x3",
-        "BriefDescription": "Number of SSE/AVX computational scalar floating-point instructions retired. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational scalar floating-point instructions retired. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. (RSQRT for single precision?)",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.SCALAR",
        "SampleAfterValue": "2000003",
@ -51,7 +51,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x4",
-        "BriefDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired.  Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired.  Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
        "SampleAfterValue": "2000003",
@ -60,7 +60,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x8",
-        "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
        "SampleAfterValue": "2000003",
@ -69,7 +69,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x10",
-        "BriefDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE",
        "SampleAfterValue": "2000003",
@ -78,7 +78,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x15",
-        "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.  ?.",
+        "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.DOUBLE",
        "SampleAfterValue": "2000006",
@ -87,7 +87,7 @@
    {
        "EventCode": "0xc7",
        "UMask": "0x20",
-        "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired.  Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired.  Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE",
        "SampleAfterValue": "2000003",
@ -96,7 +96,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x2a",
-        "BriefDescription": "Number of SSE/AVX computational single precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. ?.",
+        "BriefDescription": "Number of SSE/AVX computational single precision floating-point instructions retired. Applies to SSE* and AVX*scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.SINGLE",
        "SampleAfterValue": "2000005",
@ -105,7 +105,7 @@
    {
        "EventCode": "0xC7",
        "UMask": "0x3c",
-        "BriefDescription": "Number of SSE/AVX computational packed floating-point instructions retired. Applies to SSE* and AVX*, packed, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RSQRT RCP SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "BriefDescription": "Number of SSE/AVX computational packed floating-point instructions retired. Applies to SSE* and AVX*, packed, double and single precision floating-point: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element. (RSQRT for single-precision?)",
        "Counter": "0,1,2,3",
        "EventName": "FP_ARITH_INST_RETIRED.PACKED",
        "SampleAfterValue": "2000004",
--- a/tools/perf/pmu-events/arch/x86/broadwellx/memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/memory.json
@ -170,11 +170,11 @@
    {
        "EventCode": "0xc8",
        "UMask": "0x4",
-        "BriefDescription": "Number of times HLE abort was triggered (PEBS)",
+        "BriefDescription": "Number of times HLE abort was triggered",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "HLE_RETIRED.ABORTED",
-        "PublicDescription": "Number of times HLE abort was triggered (PEBS).",
+        "PublicDescription": "Number of times HLE abort was triggered.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -251,11 +251,11 @@
    {
        "EventCode": "0xc9",
        "UMask": "0x4",
-        "BriefDescription": "Number of times RTM abort was triggered (PEBS)",
+        "BriefDescription": "Number of times RTM abort was triggered",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "RTM_RETIRED.ABORTED",
-        "PublicDescription": "Number of times RTM abort was triggered (PEBS).",
+        "PublicDescription": "Number of times RTM abort was triggered .",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3"
    },
@ -312,14 +312,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 4",
+        "BriefDescription": "Randomly selected loads with latency value being above 4",
        "PEBS": "2",
        "MSRValue": "0x4",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above four.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above four.",
        "TakenAlone": "1",
        "SampleAfterValue": "100003",
        "CounterHTOff": "3"
@ -327,14 +327,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 8",
+        "BriefDescription": "Randomly selected loads with latency value being above 8",
        "PEBS": "2",
        "MSRValue": "0x8",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above eight.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above eight.",
        "TakenAlone": "1",
        "SampleAfterValue": "50021",
        "CounterHTOff": "3"
@ -342,14 +342,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 16",
+        "BriefDescription": "Randomly selected loads with latency value being above 16",
        "PEBS": "2",
        "MSRValue": "0x10",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 16.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 16.",
        "TakenAlone": "1",
        "SampleAfterValue": "20011",
        "CounterHTOff": "3"
@ -357,14 +357,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 32",
+        "BriefDescription": "Randomly selected loads with latency value being above 32",
        "PEBS": "2",
        "MSRValue": "0x20",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 32.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 32.",
        "TakenAlone": "1",
        "SampleAfterValue": "100007",
        "CounterHTOff": "3"
@ -372,14 +372,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 64",
+        "BriefDescription": "Randomly selected loads with latency value being above 64",
        "PEBS": "2",
        "MSRValue": "0x40",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 64.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 64.",
        "TakenAlone": "1",
        "SampleAfterValue": "2003",
        "CounterHTOff": "3"
@ -387,14 +387,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 128",
+        "BriefDescription": "Randomly selected loads with latency value being above 128",
        "PEBS": "2",
        "MSRValue": "0x80",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 128.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 128.",
        "TakenAlone": "1",
        "SampleAfterValue": "1009",
        "CounterHTOff": "3"
@ -402,14 +402,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 256",
+        "BriefDescription": "Randomly selected loads with latency value being above 256",
        "PEBS": "2",
        "MSRValue": "0x100",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 256.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 256.",
        "TakenAlone": "1",
        "SampleAfterValue": "503",
        "CounterHTOff": "3"
@ -417,14 +417,14 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 512",
+        "BriefDescription": "Randomly selected loads with latency value being above 512",
        "PEBS": "2",
        "MSRValue": "0x200",
        "Counter": "3",
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512",
        "MSRIndex": "0x3F6",
        "Errata": "BDM100, BDM35",
-        "PublicDescription": "This event counts loads with latency value being above 512.",
+        "PublicDescription": "Counts randomly selected loads with latency value being above 512.",
        "TakenAlone": "1",
        "SampleAfterValue": "101",
        "CounterHTOff": "3"
@ -433,12 +433,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all requests that miss in the L3",
-        "MSRValue": "0x3fbfc08fff",
+        "BriefDescription": "Counts all requests miss in the L3",
+        "MSRValue": "0x3FBFC08FFF",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all requests that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -446,12 +446,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and clean or shared data is transferred from remote cache",
-        "MSRValue": "0x087fc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and clean or shared data is transferred from remote cache",
+        "MSRValue": "0x087FC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_HIT_FORWARD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and clean or shared data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and clean or shared data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -459,12 +459,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -472,12 +472,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from remote dram",
-        "MSRValue": "0x063bc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from remote dram",
+        "MSRValue": "0x063BC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from remote dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from remote dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -485,12 +485,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram",
-        "MSRValue": "0x06040007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from local dram",
+        "MSRValue": "0x06040007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -498,12 +498,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3",
-        "MSRValue": "0x3fbfc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss in the L3",
+        "MSRValue": "0x3FBFC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -511,12 +511,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0604000244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -524,12 +524,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00244",
+        "BriefDescription": "Counts all demand & prefetch code reads miss in the L3",
+        "MSRValue": "0x3FBFC00244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -537,12 +537,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0604000122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -550,12 +550,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss in the L3",
-        "MSRValue": "0x3fbfc00122",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss in the L3",
+        "MSRValue": "0x3FBFC00122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -563,12 +563,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache",
-        "MSRValue": "0x087fc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and clean or shared data is transferred from remote cache",
+        "MSRValue": "0x087FC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_HIT_FORWARD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and clean or shared data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -576,12 +576,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -589,12 +589,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram",
-        "MSRValue": "0x063bc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from remote dram",
+        "MSRValue": "0x063BC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from remote dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -602,12 +602,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0604000091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -615,12 +615,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss in the L3",
-        "MSRValue": "0x3fbfc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss in the L3",
+        "MSRValue": "0x3FBFC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -628,12 +628,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00200",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
+        "MSRValue": "0x3FBFC00200",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -641,12 +641,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3",
-        "MSRValue": "0x3fbfc00100",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
+        "MSRValue": "0x3FBFC00100",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -654,12 +654,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc00002",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC00002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -667,12 +667,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss in the L3",
-        "MSRValue": "0x3fbfc00002",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss in the L3",
+        "MSRValue": "0x3FBFC00002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/broadwellx/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/pipeline.json
@ -1,6 +1,5 @@
 [
    {
-        "EventCode": "0x00",
        "UMask": "0x1",
        "BriefDescription": "Instructions retired from execution.",
        "Counter": "Fixed counter 0",
@ -10,7 +9,6 @@
        "CounterHTOff": "Fixed counter 0"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when the thread is not in halt state",
        "Counter": "Fixed counter 1",
@ -20,7 +18,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
        "Counter": "Fixed counter 1",
@ -30,7 +27,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x3",
        "BriefDescription": "Reference cycles when the core is not in halt state.",
        "Counter": "Fixed counter 2",
@ -322,7 +318,7 @@
        "BriefDescription": "Stalls caused by changing prefix length of the instruction.",
        "Counter": "0,1,2,3",
        "EventName": "ILD_STALL.LCP",
-        "PublicDescription": "This event counts stalls occurred due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
+        "PublicDescription": "This event counts stalls occured due to changing prefix length (66, 67 or REX.W when they change the length of the decoded instruction). Occurrences counting is proportional to the number of prefixes in a 16B-line. This may result in the following penalties: three-cycle penalty for each LCP in a 16-byte chunk.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -786,12 +782,12 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "EventCode": "0xA2",
+        "EventCode": "0xa2",
        "UMask": "0x1",
        "BriefDescription": "Resource-related stall cycles",
        "Counter": "0,1,2,3",
        "EventName": "RESOURCE_STALLS.ANY",
-        "PublicDescription": "This event counts resource-related stall cycles. Reasons for stalls can be as follows:\n - *any* u-arch structure got full (LB, SB, RS, ROB, BOB, LM, Physical Register Reclaim Table (PRRT), or Physical History Table (PHT) slots)\n - *any* u-arch structure got empty (like INT/SIMD FreeLists)\n - FPU control word (FPCW), MXCSR\nand others. This counts cycles that the pipeline backend blocked uop delivery from the front end.",
+        "PublicDescription": "This event counts resource-related stall cycles.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1168,12 +1164,12 @@
    {
        "EventCode": "0xC2",
        "UMask": "0x1",
-        "BriefDescription": "Actually retired uops. (Precise Event - PEBS)",
+        "BriefDescription": "Actually retired uops.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "UOPS_RETIRED.ALL",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight.",
+        "PublicDescription": "This event counts all actually retired uops. Counting increments by two for micro-fused uops, and by one for macro-fused and other uops. Maximal increment value for one cycle is eight.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1204,11 +1200,11 @@
    {
        "EventCode": "0xC2",
        "UMask": "0x2",
-        "BriefDescription": "Retirement slots used. (Precise Event - PEBS)",
+        "BriefDescription": "Retirement slots used.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "UOPS_RETIRED.RETIRE_SLOTS",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts the number of retirement slots used.",
+        "PublicDescription": "This event counts the number of retirement slots used.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1266,33 +1262,33 @@
    {
        "EventCode": "0xC4",
        "UMask": "0x1",
-        "BriefDescription": "Conditional branch instructions retired. (Precise Event - PEBS)",
+        "BriefDescription": "Conditional branch instructions retired.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.CONDITIONAL",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts conditional branch instructions retired.",
+        "PublicDescription": "This event counts conditional branch instructions retired.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
        "EventCode": "0xC4",
        "UMask": "0x2",
-        "BriefDescription": "Direct and indirect near call instructions retired. (Precise Event - PEBS)",
+        "BriefDescription": "Direct and indirect near call instructions retired.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_CALL",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts both direct and indirect near call instructions retired.",
+        "PublicDescription": "This event counts both direct and indirect near call instructions retired.",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
        "EventCode": "0xC4",
        "UMask": "0x2",
-        "BriefDescription": "Direct and indirect macro near call instructions retired (captured in ring 3). (Precise Event - PEBS)",
+        "BriefDescription": "Direct and indirect macro near call instructions retired (captured in ring 3).",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_CALL_R3",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts both direct and indirect macro near call instructions retired (captured in ring 3).",
+        "PublicDescription": "This event counts both direct and indirect macro near call instructions retired (captured in ring 3).",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1311,11 +1307,11 @@
    {
        "EventCode": "0xC4",
        "UMask": "0x8",
-        "BriefDescription": "Return instructions retired. (Precise Event - PEBS)",
+        "BriefDescription": "Return instructions retired.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts return instructions retired.",
+        "PublicDescription": "This event counts return instructions retired.",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1332,11 +1328,11 @@
    {
        "EventCode": "0xC4",
        "UMask": "0x20",
-        "BriefDescription": "Taken branch instructions retired. (Precise Event - PEBS)",
+        "BriefDescription": "Taken branch instructions retired.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts taken branch instructions retired.",
+        "PublicDescription": "This event counts taken branch instructions retired.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1364,11 +1360,11 @@
    {
        "EventCode": "0xC5",
        "UMask": "0x1",
-        "BriefDescription": "Mispredicted conditional branch instructions retired. (Precise Event - PEBS)",
+        "BriefDescription": "Mispredicted conditional branch instructions retired.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_MISP_RETIRED.CONDITIONAL",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts mispredicted conditional branch instructions retired.",
+        "PublicDescription": "This event counts mispredicted conditional branch instructions retired.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1386,22 +1382,22 @@
    {
        "EventCode": "0xC5",
        "UMask": "0x8",
-        "BriefDescription": "This event counts the number of mispredicted ret instructions retired.(Precise Event)",
+        "BriefDescription": "This event counts the number of mispredicted ret instructions retired. Non PEBS",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_MISP_RETIRED.RET",
-        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts mispredicted return instructions retired.",
+        "PublicDescription": "This event counts mispredicted return instructions retired.",
        "SampleAfterValue": "100007",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
        "EventCode": "0xC5",
        "UMask": "0x20",
-        "BriefDescription": "number of near branch instructions retired that were mispredicted and taken. (Precise Event - PEBS).",
+        "BriefDescription": "number of near branch instructions retired that were mispredicted and taken.",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
-        "PublicDescription": "Number of near branch instructions retired that were mispredicted and taken. (Precise Event - PEBS).",
+        "PublicDescription": "Number of near branch instructions retired that were mispredicted and taken.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@ -1,164 +1,394 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ((UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )",
-        "MetricGroup": "Frontend",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1 ) )",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ))",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL  - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) * (( INT_MISC.CLEAR_RESTEER_CYCLES + 9 * BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * cycles)) ) * (4 * cycles) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "Branch_Misprediction_Cost"
    },
    {
+        "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) * (( INT_MISC.CLEAR_RESTEER_CYCLES + 9 * BACLEARS.ANY ) / cycles) / (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) ) * (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Branch Misprediction Cost: Fraction of TopDown slots wasted per branch misprediction (jeclear and baclear)",
+        "MetricGroup": "Branch_Mispredicts_SMT",
+        "MetricName": "Branch_Misprediction_Cost_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
+    },
+    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles) )",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) )",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Access_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
+        "MetricExpr": "1000000000 * ( cha@event\\=0x36\\\\\\,umask\\=0x21@ / cha@event\\=0x35\\\\\\,umask\\=0x21@ ) / ( cha_0@event\\=0x0@ / duration_time )",
+        "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_Lat",
+        "MetricName": "DRAM_Read_Latency"
+    },
+    {
+        "MetricExpr": "cha@event\\=0x36\\\\\\,umask\\=0x21@ / cha@event\\=0x36\\\\\\,umask\\=0x21\\\\\\,thresh\\=1@",
+        "BriefDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_Parallel_Reads"
+    },
+    {
+        "MetricExpr": "( 1000000000 * ( imc@event\\=0xe0\\\\\\,umask\\=0x1@ / imc@event\\=0xe3@ ) / imc_0@event\\=0x0@ ) if 1 if 1 == 1 else 0 else 0",
+        "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches",
+        "MetricGroup": "Memory_Lat",
+        "MetricName": "MEM_PMM_Read_Latency"
+    },
+    {
+        "MetricExpr": "( ( 64 * imc@event\\=0xe3@ / 1000000000 ) / duration_time ) if 1 if 1 == 1 else 0 else 0",
+        "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "PMM_Read_BW"
+    },
+    {
+        "MetricExpr": "( ( 64 * imc@event\\=0xe7@ / 1000000000 ) / duration_time ) if 1 if 1 == 1 else 0 else 0",
+        "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "PMM_Write_BW"
+    },
+    {
+        "MetricExpr": "cha_0@event\\=0x0@",
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricGroup": "",
+        "MetricName": "Socket_CLKS"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/goldmont/cache.json
+++ b/tools/perf/pmu-events/arch/x86/goldmont/cache.json
--- a/tools/perf/pmu-events/arch/x86/goldmont/memory.json
+++ b/tools/perf/pmu-events/arch/x86/goldmont/memory.json
@ -30,265 +30,5 @@
        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
        "SampleAfterValue": "200003",
        "BriefDescription": "Machine clears due to memory ordering issue"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x20000032b7 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.ANY_READ.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000022 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.ANY_RFO.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data reads (demand & prefetch) that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000003091",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data reads (demand & prefetch) that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data reads generated by L1 or L2 prefetchers that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000003010 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.ANY_PF_DATA_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts requests to the uncore subsystem that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000008000 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts requests to the uncore subsystem that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts any data writes to uncacheable write combining (USWC) memory region  that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000004800 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.STREAMING_STORES.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region  that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts partial cache line data writes to uncacheable write combining (USWC) memory region  that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000004000 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PARTIAL_STREAMING_STORES.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts partial cache line data writes to uncacheable write combining (USWC) memory region  that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000002000 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data cache lines requests by software prefetch instructions that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000001000 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.SW_PREFETCH.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cache lines requests by software prefetch instructions that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000800 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.FULL_STREAMING_STORES.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts bus lock and split lock requests that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000400 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts bus lock and split lock requests that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts code reads in uncacheable (UC) memory region that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000200 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.UC_CODE_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts code reads in uncacheable (UC) memory region that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts the number of demand write requests (RFO) generated by a write to partial data cache line, including the writes to uncacheable (UC) and write through (WT), and write protected (WP) types of memory that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000100 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts the number of demand write requests (RFO) generated by a write to partial data cache line, including the writes to uncacheable (UC) and write through (WT), and write protected (WP) types of memory that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts demand data partial reads, including data in uncacheable (UC) or uncacheable write combining (USWC) memory types that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000080 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand data partial reads, including data in uncacheable (UC) or uncacheable write combining (USWC) memory types that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000020 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000010 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000008 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.COREWB.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000004 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000002 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
-    },
-    {
-        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts demand cacheable data reads of full cache lines that miss the L2 cache and targets non-DRAM system address. Requires MSR_OFFCORE_RESP[0,1] to specify request type and response. (duplicated for both MSRs)",
-        "EventCode": "0xB7",
-        "MSRValue": "0x2000000001 ",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L2_MISS.NON_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand cacheable data reads of full cache lines that miss the L2 cache and targets non-DRAM system address.",
-        "Offcore": "1"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/goldmont/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/goldmont/pipeline.json
@ -1,7 +1,6 @@
 [
    {
        "PublicDescription": "Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers.  This event uses fixed counter 0.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -10,7 +9,6 @@
    },
    {
        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.CORE",
@ -19,7 +17,6 @@
    },
    {
        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction.  In mobile systems the core frequency may change from time.  This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time.  This event uses fixed counter 2.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
@ -188,7 +185,7 @@
    },
    {
        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification.  Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors.",
+        "PublicDescription": "Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification.  Self-modifying code (SMC) causes a severe penalty in all Intel\u00ae architecture processors.",
        "EventCode": "0xC3",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
--- a/tools/perf/pmu-events/arch/x86/goldmont/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/goldmont/virtual-memory.json
@ -48,7 +48,8 @@
        "UMask": "0x11",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_LOADS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Load uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -59,7 +60,8 @@
        "UMask": "0x12",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_STORES",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Store uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Store uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -70,6 +72,7 @@
        "UMask": "0x13",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Memory uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Memory uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json
@ -92,7 +92,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Locked load uops retired (Precise event capable)"
+        "BriefDescription": "Locked load uops retired (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -104,7 +105,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that split a cache-line (Precise event capable)"
+        "BriefDescription": "Load uops retired that split a cache-line (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -116,7 +118,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Stores uops retired that split a cache-line (Precise event capable)"
+        "BriefDescription": "Stores uops retired that split a cache-line (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -128,7 +131,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Memory uops retired that split a cache-line (Precise event capable)"
+        "BriefDescription": "Memory uops retired that split a cache-line (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -140,7 +144,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired (Precise event capable)"
+        "BriefDescription": "Load uops retired (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -152,7 +157,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Store uops retired (Precise event capable)"
+        "BriefDescription": "Store uops retired (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -164,7 +170,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Memory uops retired (Precise event capable)"
+        "BriefDescription": "Memory uops retired (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -176,7 +183,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that hit L1 data cache (Precise event capable)"
+        "BriefDescription": "Load uops retired that hit L1 data cache (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -188,7 +196,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that hit L2 (Precise event capable)"
+        "BriefDescription": "Load uops retired that hit L2 (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -200,7 +209,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that missed L1 data cache (Precise event capable)"
+        "BriefDescription": "Load uops retired that missed L1 data cache (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -212,7 +222,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that missed L2 (Precise event capable)"
+        "BriefDescription": "Load uops retired that missed L2 (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -224,7 +235,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.HITM",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Memory uop retired where cross core or cross module HITM occurred (Precise event capable)"
+        "BriefDescription": "Memory uop retired where cross core or cross module HITM occurred (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -236,7 +248,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Loads retired that hit WCB (Precise event capable)"
+        "BriefDescription": "Loads retired that hit WCB (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -248,7 +261,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Loads retired that came from DRAM (Precise event capable)"
+        "BriefDescription": "Loads retired that came from DRAM (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "CollectPEBSRecord": "1",
@ -292,7 +306,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand cacheable data reads of full cache lines true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts demand cacheable data reads of full cache lines true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -367,7 +381,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts demand reads for ownership (RFO) requests generated by a write to full data cache line true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -442,7 +456,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts demand instruction cacheline and I-side prefetch requests that miss the instruction cache true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -517,7 +531,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts the number of writeback transactions caused by L1 or L2 cache evictions true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -592,7 +606,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data cacheline reads generated by hardware L2 cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -667,7 +681,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts reads for ownership (RFO) requests generated by L2 prefetcher true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -742,7 +756,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts bus lock and split lock requests true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts bus lock and split lock requests true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -817,7 +831,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts full cache line data writes to uncacheable write combining (USWC) memory region and full cache-line non-temporal writes true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -892,7 +906,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cache lines requests by software prefetch instructions true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data cache lines requests by software prefetch instructions true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -967,7 +981,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data cache line reads generated by hardware L1 data cache prefetcher true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1042,7 +1056,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region  true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts any data writes to uncacheable write combining (USWC) memory region  true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1117,7 +1131,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts requests to the uncore subsystem true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts requests to the uncore subsystem true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1192,7 +1206,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data reads generated by L1 or L2 prefetchers true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1267,7 +1281,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data reads (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data reads (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1342,7 +1356,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts reads for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
@ -1417,7 +1431,7 @@
        "PDIR_COUNTER": "na",
        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module. ",
+        "BriefDescription": "Counts data read, code read, and read for ownership (RFO) requests (demand & prefetch) true miss for the L2 cache with a snoop miss in the other processor module.",
        "Offcore": "1"
    },
    {
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json
@ -3,7 +3,6 @@
        "PEBS": "2",
        "CollectPEBSRecord": "1",
        "PublicDescription": "Counts the number of instructions that retire execution. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. The counter continues counting during hardware interrupts, traps, and inside interrupt handlers.  This event uses fixed counter 0.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "PEBScounters": "32",
@ -15,7 +14,6 @@
    {
        "CollectPEBSRecord": "1",
        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. In mobile systems the core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "PEBScounters": "33",
@ -27,7 +25,6 @@
    {
        "CollectPEBSRecord": "1",
        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction.  In mobile systems the core frequency may change from time.  This event is not affected by core frequency changes but counts as if the core is running at the maximum frequency all the time.  This event uses fixed counter 2.  You cannot collect a PEBs record for this event.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "PEBScounters": "34",
@ -231,7 +228,7 @@
    },
    {
        "CollectPEBSRecord": "1",
-        "PublicDescription": "Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification.  Self-modifying code (SMC) causes a severe penalty in all Intel architecture processors.",
+        "PublicDescription": "Counts the number of times that the processor detects that a program is writing to a code section and has to perform a machine clear because of that modification.  Self-modifying code (SMC) causes a severe penalty in all Intel\u00ae architecture processors.",
        "EventCode": "0xC3",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json
@ -189,7 +189,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_LOADS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Load uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Load uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -201,7 +202,8 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS_STORES",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Store uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Store uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    },
    {
        "PEBS": "2",
@ -213,6 +215,7 @@
        "PEBScounters": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.DTLB_MISS",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Memory uops retired that missed the DTLB (Precise event capable)"
+        "BriefDescription": "Memory uops retired that missed the DTLB (Precise event capable)",
+        "Data_LA": "1"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/haswell/cache.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/cache.json
@ -63,10 +63,10 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Demand data read requests that hit L2 cache.",
+        "PublicDescription": "Counts the number of demand Data Read requests, initiated by load instructions, that hit L2 cache",
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
-        "UMask": "0x41",
+        "UMask": "0xc1",
        "Errata": "HSD78",
        "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
        "SampleAfterValue": "200003",
@ -77,7 +77,7 @@
        "PublicDescription": "Counts the number of store RFO requests that hit the L2 cache.",
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
-        "UMask": "0x42",
+        "UMask": "0xc2",
        "EventName": "L2_RQSTS.RFO_HIT",
        "SampleAfterValue": "200003",
        "BriefDescription": "RFO requests that hit L2 cache",
@ -87,7 +87,7 @@
        "PublicDescription": "Number of instruction fetches that hit the L2 cache.",
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
-        "UMask": "0x44",
+        "UMask": "0xc4",
        "EventName": "L2_RQSTS.CODE_RD_HIT",
        "SampleAfterValue": "200003",
        "BriefDescription": "L2 cache hits when fetching instructions, code reads.",
@ -97,7 +97,7 @@
        "PublicDescription": "Counts all L2 HW prefetcher requests that hit L2.",
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
-        "UMask": "0x50",
+        "UMask": "0xd0",
        "EventName": "L2_RQSTS.L2_PF_HIT",
        "SampleAfterValue": "200003",
        "BriefDescription": "L2 prefetch requests that hit L2 cache",
@ -610,7 +610,7 @@
        "Errata": "HSD29, HSD25, HSM26, HSM30",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT",
        "SampleAfterValue": "20011",
-        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. ",
+        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache.",
        "CounterHTOff": "0,1,2,3",
        "Data_LA": "1"
    },
@ -623,7 +623,7 @@
        "Errata": "HSD29, HSD25, HSM26, HSM30",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM",
        "SampleAfterValue": "20011",
-        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3. ",
+        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3.",
        "CounterHTOff": "0,1,2,3",
        "Data_LA": "1"
    },
@ -792,7 +792,6 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "",
        "EventCode": "0xf4",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
@ -802,262 +801,262 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Counts all requests that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c8fff",
+        "MSRValue": "0x3F803C8FFF",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all requests that hit in the L3",
+        "BriefDescription": "Counts all requests hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c07f7",
+        "MSRValue": "0x10003C07F7",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c07f7",
+        "MSRValue": "0x04003C07F7",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0244",
+        "MSRValue": "0x04003C0244",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c0122",
+        "MSRValue": "0x10003C0122",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0122",
+        "MSRValue": "0x04003C0122",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c0091",
+        "MSRValue": "0x10003C0091",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0091",
+        "MSRValue": "0x04003C0091",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0200",
+        "MSRValue": "0x3F803C0200",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_CODE_RD.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs  that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0100",
+        "MSRValue": "0x3F803C0100",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs  that hit in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0080",
+        "MSRValue": "0x3F803C0080",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0040",
+        "MSRValue": "0x3F803C0040",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads that hit in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0020",
+        "MSRValue": "0x3F803C0020",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads hit in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3f803c0010",
+        "MSRValue": "0x3F803C0010",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads hit in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c0004",
+        "MSRValue": "0x10003C0004",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "Counts all demand code reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0004",
+        "MSRValue": "0x04003C0004",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts all demand code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c0002",
+        "MSRValue": "0x10003C0002",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0002",
+        "MSRValue": "0x04003C0002",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x10003c0001",
+        "MSRValue": "0x10003C0001",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "BriefDescription": "Counts demand data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x04003c0001",
+        "MSRValue": "0x04003C0001",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "BriefDescription": "Counts demand data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/haswell/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/floating-point.json
@ -1,22 +1,26 @@
 [
    {
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x8",
        "Errata": "HSD56, HSM57",
        "EventName": "OTHER_ASSISTS.AVX_TO_SSE",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable.",
+        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "Errata": "HSD56, HSM57",
        "EventName": "OTHER_ASSISTS.SSE_TO_AVX",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from SSE to AVX-256 when penalty applicable.",
+        "BriefDescription": "Number of transitions from legacy SSE to AVX-256 when penalty applicable",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -30,53 +34,58 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of X87 FP assists due to output values.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
        "EventName": "FP_ASSIST.X87_OUTPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to output value.",
+        "BriefDescription": "output - Numeric Overflow, Numeric Underflow, Inexact Result",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of X87 FP assists due to input values.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x4",
        "EventName": "FP_ASSIST.X87_INPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to input value.",
+        "BriefDescription": "input - Invalid Operation, Denormal Operand, SNaN Operand",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of SIMD FP assists due to output values.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x8",
        "EventName": "FP_ASSIST.SIMD_OUTPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to Output values",
+        "BriefDescription": "SSE* FP micro-code assist when output value is invalid.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of SIMD FP assists due to input values.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "EventName": "FP_ASSIST.SIMD_INPUT",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to input values",
+        "BriefDescription": "Any input SSE* FP Assist",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Cycles with any input/output SSE* or FP assists.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
        "UMask": "0x1e",
        "EventName": "FP_ASSIST.ANY",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Cycles with any input/output SSE or FP assist",
+        "BriefDescription": "Counts any FP_ASSIST umask was incrementing",
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
@ -1,158 +1,322 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
-        "MetricGroup": "Frontend",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
    },
    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / cycles",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/haswell/memory.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/memory.json
@ -298,7 +298,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Loads with latency value being above 4.",
+        "BriefDescription": "Randomly selected loads with latency value being above 4.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -312,7 +312,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "50021",
-        "BriefDescription": "Loads with latency value being above 8.",
+        "BriefDescription": "Randomly selected loads with latency value being above 8.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -326,7 +326,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "20011",
-        "BriefDescription": "Loads with latency value being above 16.",
+        "BriefDescription": "Randomly selected loads with latency value being above 16.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -340,7 +340,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Loads with latency value being above 32.",
+        "BriefDescription": "Randomly selected loads with latency value being above 32.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -354,7 +354,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "2003",
-        "BriefDescription": "Loads with latency value being above 64.",
+        "BriefDescription": "Randomly selected loads with latency value being above 64.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -368,7 +368,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "1009",
-        "BriefDescription": "Loads with latency value being above 128.",
+        "BriefDescription": "Randomly selected loads with latency value being above 128.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -382,7 +382,7 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "503",
-        "BriefDescription": "Loads with latency value being above 256.",
+        "BriefDescription": "Randomly selected loads with latency value being above 256.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
@ -396,280 +396,280 @@
        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512",
        "MSRIndex": "0x3F6",
        "SampleAfterValue": "101",
-        "BriefDescription": "Loads with latency value being above 512.",
+        "BriefDescription": "Randomly selected loads with latency value being above 512.",
        "TakenAlone": "1",
        "CounterHTOff": "3"
    },
    {
-        "PublicDescription": "Counts all requests that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc08fff",
+        "MSRValue": "0x3FFFC08FFF",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all requests that miss in the L3",
+        "BriefDescription": "Counts all requests miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x01004007f7",
+        "MSRValue": "0x01004007F7",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc007f7",
+        "MSRValue": "0x3FFFC007F7",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3",
+        "BriefDescription": "miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400244",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00244",
+        "MSRValue": "0x3FFFC00244",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss in the L3",
+        "BriefDescription": "Counts all demand & prefetch code reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400122",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00122",
+        "MSRValue": "0x3FFFC00122",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss in the L3",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400091",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand & prefetch data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00091",
+        "MSRValue": "0x3FFFC00091",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss in the L3",
+        "BriefDescription": "Counts all demand & prefetch data reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00200",
+        "MSRValue": "0x3FFFC00200",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_CODE_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs  that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00100",
+        "MSRValue": "0x3FFFC00100",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs  that miss in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00080",
+        "MSRValue": "0x3FFFC00080",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00040",
+        "MSRValue": "0x3FFFC00040",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads that miss in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00020",
+        "MSRValue": "0x3FFFC00020",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00010",
+        "MSRValue": "0x3FFFC00010",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400004",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand code reads miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00004",
+        "MSRValue": "0x3FFFC00004",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand code reads that miss in the L3",
+        "BriefDescription": "Counts all demand code reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400002",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00002",
+        "MSRValue": "0x3FFFC00002",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss in the L3",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads miss the L3 and the data is returned from local dram",
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x0100400001",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.LOCAL_DRAM",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts demand data reads miss the L3 and the data is returned from local dram",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "Counts demand data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads miss in the L3",
        "EventCode": "0xB7, 0xBB",
-        "MSRValue": "0x3fffc00001",
+        "MSRValue": "0x3FFFC00001",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
+        "MSRIndex": "0x1a6, 0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts demand data reads that miss in the L3",
+        "BriefDescription": "Counts demand data reads miss in the L3",
        "Offcore": "1",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/haswell/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/pipeline.json
@ -1,7 +1,6 @@
 [
    {
        "PublicDescription": "This event counts the number of instructions retired from execution. For instructions that consist of multiple micro-ops, this event counts the retirement of the last micro-op of the instruction. Counting continues during hardware interrupts, traps, and inside interrupt handlers. INST_RETIRED.ANY is counted by a designated fixed counter, leaving the programmable counters available for other events. Faulting executions of GETSEC/VM entry/VM Exit/MWait will not count as retired instructions.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "Errata": "HSD140, HSD143",
@ -12,7 +11,6 @@
    },
    {
        "PublicDescription": "This event counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -21,7 +19,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "AnyThread": "1",
@ -32,7 +29,6 @@
    },
    {
        "PublicDescription": "This event counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
@ -1071,7 +1067,8 @@
        "CounterHTOff": "1"
    },
    {
-        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling.",
+        "PEBS": "1",
+        "PublicDescription": "This is a precise version (that is, uses PEBS) of the event that counts FP operations retired. For X87 FP operations that have no exceptions counting also includes flows that have several X87, or flows that use X87 uops in the exception handling.",
        "EventCode": "0xC0",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
@ -1081,13 +1078,13 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of microcode assists invoked by HW upon uop writeback.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC1",
        "Counter": "0,1,2,3",
        "UMask": "0x40",
        "EventName": "OTHER_ASSISTS.ANY_WB_ASSIST",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of times any microcode assist is invoked by HW upon uop writeback.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -1102,28 +1099,34 @@
        "Data_LA": "1"
    },
    {
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC2",
        "Invert": "1",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "UOPS_RETIRED.STALL_CYCLES",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles without actually retired uops.",
+        "BriefDescription": "Cycles no executable uops retired",
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3"
    },
    {
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC2",
        "Invert": "1",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
        "EventName": "UOPS_RETIRED.TOTAL_CYCLES",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with less than 10 actually retired uops.",
+        "BriefDescription": "Number of cycles using always true condition applied to  PEBS uops retired event.",
        "CounterMask": "10",
        "CounterHTOff": "0,1,2,3"
    },
    {
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC2",
        "Invert": "1",
        "Counter": "0,1,2,3",
@ -1131,7 +1134,7 @@
        "AnyThread": "1",
        "EventName": "UOPS_RETIRED.CORE_STALL_CYCLES",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles without actually retired uops.",
+        "BriefDescription": "Cycles no executable uops retired on core",
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3"
    },
@ -1245,13 +1248,14 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Counts the number of not taken branch instructions retired.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC4",
        "Counter": "0,1,2,3",
        "UMask": "0x10",
        "EventName": "BR_INST_RETIRED.NOT_TAKEN",
        "SampleAfterValue": "400009",
-        "BriefDescription": "Not taken branch instructions retired.",
+        "BriefDescription": "Counts all not taken macro branch instructions retired.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
@ -1265,13 +1269,14 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "Number of far branches retired.",
+        "PEBS": "1",
+        "PublicDescription": "",
        "EventCode": "0xC4",
        "Counter": "0,1,2,3",
        "UMask": "0x40",
        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Far branch instructions retired.",
+        "BriefDescription": "Counts the number of far branch instructions retired.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
--- a/tools/perf/pmu-events/arch/x86/haswellx/cache.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/cache.json
@ -64,18 +64,18 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x41",
+        "UMask": "0xc1",
        "BriefDescription": "Demand Data Read requests that hit L2 cache",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
        "Errata": "HSD78",
-        "PublicDescription": "Demand data read requests that hit L2 cache.",
+        "PublicDescription": "Counts the number of demand Data Read requests, initiated by load instructions, that hit L2 cache",
        "SampleAfterValue": "200003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x42",
+        "UMask": "0xc2",
        "BriefDescription": "RFO requests that hit L2 cache",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.RFO_HIT",
@ -85,7 +85,7 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x44",
+        "UMask": "0xc4",
        "BriefDescription": "L2 cache hits when fetching instructions, code reads.",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.CODE_RD_HIT",
@ -95,7 +95,7 @@
    },
    {
        "EventCode": "0x24",
-        "UMask": "0x50",
+        "UMask": "0xd0",
        "BriefDescription": "L2 prefetch requests that hit L2 cache",
        "Counter": "0,1,2,3",
        "EventName": "L2_RQSTS.L2_PF_HIT",
@ -416,7 +416,7 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x11",
-        "BriefDescription": "Retired load uops that miss the STLB. (precise Event)",
+        "BriefDescription": "Retired load uops that miss the STLB.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -428,7 +428,7 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x12",
-        "BriefDescription": "Retired store uops that miss the STLB. (precise Event)",
+        "BriefDescription": "Retired store uops that miss the STLB.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -441,7 +441,7 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x21",
-        "BriefDescription": "Retired load uops with locked access. (precise Event)",
+        "BriefDescription": "Retired load uops with locked access.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -453,34 +453,32 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x41",
-        "BriefDescription": "Retired load uops that split across a cacheline boundary. (precise Event)",
+        "BriefDescription": "Retired load uops that split across a cacheline boundary.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
        "Errata": "HSD29, HSM30",
-        "PublicDescription": "This event counts load uops retired which had memory addresses spilt across 2 cache lines. A line split is across 64B cache-lines which may include a page split (4K). This is a precise event.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x42",
-        "BriefDescription": "Retired store uops that split across a cacheline boundary. (precise Event)",
+        "BriefDescription": "Retired store uops that split across a cacheline boundary.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
        "Errata": "HSD29, HSM30",
        "L1_Hit_Indication": "1",
-        "PublicDescription": "This event counts store uops retired which had memory addresses spilt across 2 cache lines. A line split is across 64B cache-lines which may include a page split (4K). This is a precise event.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD0",
        "UMask": "0x81",
-        "BriefDescription": "All retired load uops. (precise Event)",
+        "BriefDescription": "All retired load uops.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -492,14 +490,13 @@
    {
        "EventCode": "0xD0",
        "UMask": "0x82",
-        "BriefDescription": "All retired store uops. (precise Event)",
+        "BriefDescription": "All retired store uops.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
        "Errata": "HSD29, HSM30",
        "L1_Hit_Indication": "1",
-        "PublicDescription": "This event counts all store uops retired. This is a precise event.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3"
    },
@ -530,13 +527,13 @@
    {
        "EventCode": "0xD1",
        "UMask": "0x4",
-        "BriefDescription": "Miss in last-level (L3) cache. Excludes Unknown data-source.",
+        "BriefDescription": "Retired load uops which data sources were data hits in L3 without snoops required.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
        "Errata": "HSD74, HSD29, HSD25, HSM26, HSM30",
-        "PublicDescription": "This event counts retired load uops in which data sources were data hits in the L3 cache without snoops required. This does not include hardware prefetches. This is a precise event.",
+        "PublicDescription": "Retired load uops with L3 cache hits as data sources.",
        "SampleAfterValue": "50021",
        "CounterHTOff": "0,1,2,3"
    },
@ -549,19 +546,20 @@
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS",
        "Errata": "HSM30",
-        "PublicDescription": "This event counts retired load uops in which data sources missed in the L1 cache. This does not include hardware prefetches. This is a precise event.",
+        "PublicDescription": "Retired load uops missed L1 cache as data sources.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD1",
        "UMask": "0x10",
-        "BriefDescription": "Retired load uops with L2 cache misses as data sources.",
+        "BriefDescription": "Miss in mid-level (L2) cache. Excludes Unknown data-source.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS",
        "Errata": "HSD29, HSM30",
+        "PublicDescription": "Retired load uops missed L2. Unknown data source excluded.",
        "SampleAfterValue": "50021",
        "CounterHTOff": "0,1,2,3"
    },
@ -574,6 +572,7 @@
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_MISS",
        "Errata": "HSD74, HSD29, HSD25, HSM26, HSM30",
+        "PublicDescription": "Retired load uops missed L3. Excludes unknown data source .",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -604,26 +603,24 @@
    {
        "EventCode": "0xD2",
        "UMask": "0x2",
-        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache. ",
+        "BriefDescription": "Retired load uops which data sources were L3 and cross-core snoop hits in on-pkg core cache.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT",
        "Errata": "HSD29, HSD25, HSM26, HSM30",
-        "PublicDescription": "This event counts retired load uops that hit in the L3 cache, but required a cross-core snoop which resulted in a HIT in an on-pkg core cache. This does not include hardware prefetches. This is a precise event.",
        "SampleAfterValue": "20011",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD2",
        "UMask": "0x4",
-        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3. ",
+        "BriefDescription": "Retired load uops which data sources were HitM responses from shared L3.",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM",
        "Errata": "HSD29, HSD25, HSM26, HSM30",
-        "PublicDescription": "This event counts retired load uops that hit in the L3 cache, but required a cross-core snoop which resulted in a HITM (hit modified) in an on-pkg core cache. This does not include hardware prefetches. This is a precise event.",
        "SampleAfterValue": "20011",
        "CounterHTOff": "0,1,2,3"
    },
@ -642,19 +639,20 @@
    {
        "EventCode": "0xD3",
        "UMask": "0x1",
+        "BriefDescription": "Data from local DRAM either Snoop not needed or Snoop Miss (RspI)",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM",
        "Errata": "HSD74, HSD29, HSD25, HSM30",
-        "PublicDescription": "This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches. This is a precise event.",
+        "PublicDescription": "This event counts retired load uops where the data came from local DRAM. This does not include hardware prefetches.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
    {
        "EventCode": "0xD3",
        "UMask": "0x4",
-        "BriefDescription": "Retired load uop whose Data Source was: remote DRAM either Snoop not needed or Snoop Miss (RspI) (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: remote DRAM either Snoop not needed or Snoop Miss (RspI)",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -666,7 +664,7 @@
    {
        "EventCode": "0xD3",
        "UMask": "0x10",
-        "BriefDescription": "Retired load uop whose Data Source was: Remote cache HITM (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: Remote cache HITM",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -678,7 +676,7 @@
    {
        "EventCode": "0xD3",
        "UMask": "0x20",
-        "BriefDescription": "Retired load uop whose Data Source was: forwarded from remote cache (Precise Event)",
+        "BriefDescription": "Retired load uop whose Data Source was: forwarded from remote cache",
        "Data_LA": "1",
        "PEBS": "1",
        "Counter": "0,1,2,3",
@ -833,7 +831,6 @@
        "BriefDescription": "Split locks in SQ",
        "Counter": "0,1,2,3",
        "EventName": "SQ_MISC.SPLIT_LOCK",
-        "PublicDescription": "",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -841,12 +838,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0001",
+        "BriefDescription": "Counts demand data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0001",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -854,12 +851,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0001",
+        "BriefDescription": "Counts demand data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0001",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -867,12 +864,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0002",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -880,12 +877,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0002",
+        "BriefDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -893,12 +890,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0004",
+        "BriefDescription": "Counts all demand code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0004",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -906,12 +903,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0004",
+        "BriefDescription": "Counts all demand code reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0004",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -919,12 +916,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3",
-        "MSRValue": "0x3f803c0010",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads hit in the L3",
+        "MSRValue": "0x3F803C0010",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -932,12 +929,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3",
-        "MSRValue": "0x3f803c0020",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs hit in the L3",
+        "MSRValue": "0x3F803C0020",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -945,12 +942,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads that hit in the L3",
-        "MSRValue": "0x3f803c0040",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads hit in the L3",
+        "MSRValue": "0x3F803C0040",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -958,12 +955,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3",
-        "MSRValue": "0x3f803c0080",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads hit in the L3",
+        "MSRValue": "0x3F803C0080",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_DATA_RD.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -971,12 +968,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3",
-        "MSRValue": "0x3f803c0100",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
+        "MSRValue": "0x3F803C0100",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_RFO.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -984,12 +981,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3",
-        "MSRValue": "0x3f803c0200",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
+        "MSRValue": "0x3F803C0200",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_CODE_RD.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -997,12 +994,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0091",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1010,12 +1007,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0091",
+        "BriefDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1023,12 +1020,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0122",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1036,12 +1033,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c0122",
+        "BriefDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C0122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1049,12 +1046,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c0244",
+        "BriefDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C0244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1062,12 +1059,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
-        "MSRValue": "0x04003c07f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
+        "MSRValue": "0x04003C07F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_HIT.HIT_OTHER_CORE_NO_FWD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoops to sibling cores hit in either E/S state and the line is not forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1075,12 +1072,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
-        "MSRValue": "0x10003c07f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
+        "MSRValue": "0x10003C07F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_HIT.HITM_OTHER_CORE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -1088,12 +1085,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all requests that hit in the L3",
-        "MSRValue": "0x3f803c8fff",
+        "BriefDescription": "Counts all requests hit in the L3",
+        "MSRValue": "0x3F803C8FFF",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.LLC_HIT.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all requests that hit in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests hit in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
@ -1,158 +1,340 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
-        "MetricGroup": "Frontend",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
    },
    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / cycles",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
+        "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )",
+        "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_Lat",
+        "MetricName": "DRAM_Read_Latency"
+    },
+    {
+        "MetricExpr": "cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182\\,thresh\\=1@",
+        "BriefDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_Parallel_Reads"
+    },
+    {
+        "MetricExpr": "cbox_0@event\\=0x0@",
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricGroup": "",
+        "MetricName": "Socket_CLKS"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/haswellx/memory.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/memory.json
@ -291,7 +291,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 4.",
+        "BriefDescription": "Randomly selected loads with latency value being above 4.",
        "PEBS": "2",
        "MSRValue": "0x4",
        "Counter": "3",
@ -305,7 +305,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 8.",
+        "BriefDescription": "Randomly selected loads with latency value being above 8.",
        "PEBS": "2",
        "MSRValue": "0x8",
        "Counter": "3",
@ -319,7 +319,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 16.",
+        "BriefDescription": "Randomly selected loads with latency value being above 16.",
        "PEBS": "2",
        "MSRValue": "0x10",
        "Counter": "3",
@ -333,7 +333,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 32.",
+        "BriefDescription": "Randomly selected loads with latency value being above 32.",
        "PEBS": "2",
        "MSRValue": "0x20",
        "Counter": "3",
@ -347,7 +347,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 64.",
+        "BriefDescription": "Randomly selected loads with latency value being above 64.",
        "PEBS": "2",
        "MSRValue": "0x40",
        "Counter": "3",
@ -361,7 +361,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 128.",
+        "BriefDescription": "Randomly selected loads with latency value being above 128.",
        "PEBS": "2",
        "MSRValue": "0x80",
        "Counter": "3",
@ -375,7 +375,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 256.",
+        "BriefDescription": "Randomly selected loads with latency value being above 256.",
        "PEBS": "2",
        "MSRValue": "0x100",
        "Counter": "3",
@ -389,7 +389,7 @@
    {
        "EventCode": "0xCD",
        "UMask": "0x1",
-        "BriefDescription": "Loads with latency value being above 512.",
+        "BriefDescription": "Randomly selected loads with latency value being above 512.",
        "PEBS": "2",
        "MSRValue": "0x200",
        "Counter": "3",
@ -404,12 +404,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts demand data reads that miss in the L3",
-        "MSRValue": "0x3fbfc00001",
+        "BriefDescription": "Counts demand data reads miss in the L3",
+        "MSRValue": "0x3FBFC00001",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts demand data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -417,12 +417,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts demand data reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400001",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts demand data reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -430,12 +430,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss in the L3",
-        "MSRValue": "0x3fbfc00002",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss in the L3",
+        "MSRValue": "0x3FBFC00002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -443,12 +443,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -456,12 +456,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc00002",
+        "BriefDescription": "Counts all demand data writes (RFOs) miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC00002",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand data writes (RFOs) miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -469,12 +469,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00004",
+        "BriefDescription": "Counts all demand code reads miss in the L3",
+        "MSRValue": "0x3FBFC00004",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -482,12 +482,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand code reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400004",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand code reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -495,12 +495,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3",
-        "MSRValue": "0x3fbfc00010",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads miss in the L3",
+        "MSRValue": "0x3FBFC00010",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -508,12 +508,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3",
-        "MSRValue": "0x3fbfc00020",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs miss in the L3",
+        "MSRValue": "0x3FBFC00020",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -521,12 +521,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00040",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) code reads miss in the L3",
+        "MSRValue": "0x3FBFC00040",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -534,12 +534,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3",
-        "MSRValue": "0x3fbfc00080",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads miss in the L3",
+        "MSRValue": "0x3FBFC00080",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_DATA_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -547,12 +547,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3",
-        "MSRValue": "0x3fbfc00100",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
+        "MSRValue": "0x3FBFC00100",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -560,12 +560,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00200",
+        "BriefDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
+        "MSRValue": "0x3FBFC00200",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.PF_LLC_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts prefetch (that bring data to LLC only) code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -573,12 +573,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss in the L3",
-        "MSRValue": "0x3fbfc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss in the L3",
+        "MSRValue": "0x3FBFC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -586,12 +586,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -599,12 +599,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram",
-        "MSRValue": "0x063f800091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from remote dram",
+        "MSRValue": "0x063F800091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the data is returned from remote dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -612,12 +612,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -625,12 +625,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache",
-        "MSRValue": "0x083fc00091",
+        "BriefDescription": "Counts all demand & prefetch data reads miss the L3 and clean or shared data is transferred from remote cache",
+        "MSRValue": "0x083FC00091",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.LLC_MISS.REMOTE_HIT_FORWARD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch data reads miss the L3 and clean or shared data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -638,12 +638,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss in the L3",
-        "MSRValue": "0x3fbfc00122",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss in the L3",
+        "MSRValue": "0x3FBFC00122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -651,12 +651,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400122",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch RFOs miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -664,12 +664,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss in the L3",
-        "MSRValue": "0x3fbfc00244",
+        "BriefDescription": "Counts all demand & prefetch code reads miss in the L3",
+        "MSRValue": "0x3FBFC00244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -677,12 +677,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram",
+        "BriefDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "MSRValue": "0x0600400244",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_CODE_RD.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all demand & prefetch code reads that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all demand & prefetch code reads miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -690,12 +690,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3",
-        "MSRValue": "0x3fbfc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss in the L3",
+        "MSRValue": "0x3FBFC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -703,12 +703,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram",
-        "MSRValue": "0x06004007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from local dram",
+        "MSRValue": "0x06004007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.LOCAL_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from local dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from local dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -716,12 +716,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from remote dram",
-        "MSRValue": "0x063f8007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from remote dram",
+        "MSRValue": "0x063F8007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_DRAM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the data is returned from remote dram Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the data is returned from remote dram",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -729,12 +729,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the modified data is transferred from remote cache",
-        "MSRValue": "0x103fc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the modified data is transferred from remote cache",
+        "MSRValue": "0x103FC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_HITM",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and the modified data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and the modified data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -742,12 +742,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and clean or shared data is transferred from remote cache",
-        "MSRValue": "0x083fc007f7",
+        "BriefDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and clean or shared data is transferred from remote cache",
+        "MSRValue": "0x083FC007F7",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_READS.LLC_MISS.REMOTE_HIT_FORWARD",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) that miss the L3 and clean or shared data is transferred from remote cache Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all data/code/rfo reads (demand & prefetch) miss the L3 and clean or shared data is transferred from remote cache",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    },
@ -755,12 +755,12 @@
        "Offcore": "1",
        "EventCode": "0xB7, 0xBB",
        "UMask": "0x1",
-        "BriefDescription": "Counts all requests that miss in the L3",
-        "MSRValue": "0x3fbfc08fff",
+        "BriefDescription": "Counts all requests miss in the L3",
+        "MSRValue": "0x3FBFC08FFF",
        "Counter": "0,1,2,3",
        "EventName": "OFFCORE_RESPONSE.ALL_REQUESTS.LLC_MISS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
-        "PublicDescription": "Counts all requests that miss in the L3 Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "PublicDescription": "Counts all requests miss in the L3",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3"
    }
--- a/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/pipeline.json
@ -1,6 +1,5 @@
 [
    {
-        "EventCode": "0x00",
        "UMask": "0x1",
        "BriefDescription": "Instructions retired from execution.",
        "Counter": "Fixed counter 0",
@ -11,7 +10,6 @@
        "CounterHTOff": "Fixed counter 0"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when the thread is not in halt state.",
        "Counter": "Fixed counter 1",
@ -21,7 +19,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x2",
        "BriefDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
        "Counter": "Fixed counter 1",
@ -31,7 +28,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "UMask": "0x3",
        "BriefDescription": "Reference cycles when the core is not in halt state.",
        "Counter": "Fixed counter 2",
@ -1098,6 +1094,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "UOPS_RETIRED.ALL",
+        "PublicDescription": "Counts the number of micro-ops retired. Use Cmask=1 and invert to count active cycles or stalled cycles.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1142,6 +1139,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "UOPS_RETIRED.RETIRE_SLOTS",
+        "PublicDescription": "This event counts the number of retirement slots used each cycle.  There are potentially 4 slots that can be used each cycle - meaning, 4 uops or 4 instructions could retire each cycle.",
        "SampleAfterValue": "2000003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1201,6 +1199,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.CONDITIONAL",
+        "PublicDescription": "Counts the number of conditional branch instructions retired.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1241,6 +1240,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PublicDescription": "Counts the number of near return instructions retired.",
        "SampleAfterValue": "100003",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1261,6 +1261,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
+        "PublicDescription": "Number of near taken branches retired.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
@ -1312,6 +1313,7 @@
        "PEBS": "1",
        "Counter": "0,1,2,3",
        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
+        "PublicDescription": "Number of near branch instructions retired that were taken but mispredicted.",
        "SampleAfterValue": "400009",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
--- a/tools/perf/pmu-events/arch/x86/ivybridge/cache.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/cache.json
@ -1012,7 +1012,7 @@
        "EventName": "OFFCORE_RESPONSE.SPLIT_LOCK_UC_LOCK.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts requests where the address of an atomic lock instruction spans a cache line boundary or the lock instruction is executed on uncacheable address ",
+        "BriefDescription": "Counts requests where the address of an atomic lock instruction spans a cache line boundary or the lock instruction is executed on uncacheable address",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1036,7 +1036,7 @@
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand data reads ",
+        "BriefDescription": "Counts all demand data reads",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1048,7 +1048,7 @@
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand rfo's ",
+        "BriefDescription": "Counts all demand rfo's",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1084,7 +1084,7 @@
        "EventName": "OFFCORE_RESPONSE.ALL_RFO.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all demand & prefetch prefetch RFOs ",
+        "BriefDescription": "Counts all demand & prefetch prefetch RFOs",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1096,7 +1096,7 @@
        "EventName": "OFFCORE_RESPONSE.ALL_READS.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": "Counts all data/code/rfo references (demand & prefetch) ",
+        "BriefDescription": "Counts all data/code/rfo references (demand & prefetch)",
        "CounterHTOff": "0,1,2,3"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
@ -1,164 +1,340 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
-        "MetricGroup": "Frontend",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4 ) )",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
    },
    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / cycles",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/ivybridge/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/pipeline.json
@ -1,6 +1,5 @@
 [
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -9,7 +8,6 @@
        "CounterHTOff": "Fixed counter 0"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -19,7 +17,6 @@
    },
    {
        "PublicDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "AnyThread": "1",
@ -29,7 +26,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
--- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
@ -1,164 +1,346 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
-        "MetricGroup": "Frontend",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Instruction per taken branch",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "IpTB"
+    },
+    {
+        "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+        "BriefDescription": "Branch instructions per taken branch. ",
+        "MetricGroup": "Branches;PGO",
+        "MetricName": "BpTB"
+    },
+    {
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4 ) )",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "BriefDescription": "Instructions per Load (lower number means loads are more frequent)",
+        "MetricGroup": "Instruction_Type;L1_Bound",
+        "MetricName": "IpL"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Instructions per Store",
+        "MetricGroup": "Instruction_Type;Store_Bound",
+        "MetricName": "IpS"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Instructions per Branch",
+        "MetricGroup": "Branches;Instruction_Type;Port_5;Port_6",
+        "MetricName": "IpB"
+    },
+    {
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
+        "BriefDescription": "Instruction per (near) call",
+        "MetricGroup": "Branches",
+        "MetricName": "IpCall"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
-        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)",
-        "MetricGroup": "Unknown_Branches",
-        "MetricName": "BAClear_Cost"
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
+        "MetricGroup": "Branch_Mispredicts",
+        "MetricName": "IpMispredict"
    },
    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads (in core cycles)",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-thread)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / cycles",
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricGroup": "TLB_SMT",
+        "MetricName": "Page_Walks_Utilization_SMT"
+    },
+    {
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L1D_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+        "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L2_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "L3_Cache_Fill_BW"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L1MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2MPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L2HPKI_All"
+    },
+    {
+        "MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricGroup": "Cache_Misses;",
+        "MetricName": "L3MPKI"
+    },
+    {
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
+        "MetricExpr": "cbox_0@event\\=0x0@",
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricGroup": "",
+        "MetricName": "Socket_CLKS"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/ivytown/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/pipeline.json
@ -1,6 +1,5 @@
 [
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 0",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -9,7 +8,6 @@
        "CounterHTOff": "Fixed counter 0"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -19,7 +17,6 @@
    },
    {
        "PublicDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x2",
        "AnyThread": "1",
@ -29,7 +26,6 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
--- a/tools/perf/pmu-events/arch/x86/jaketown/cache.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/cache.json
@ -31,7 +31,7 @@
    },
    {
        "PEBS": "1",
-        "PublicDescription": "This event counts line-split load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This event counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "EventCode": "0xD0",
        "Counter": "0,1,2,3",
        "UMask": "0x41",
@ -42,7 +42,7 @@
    },
    {
        "PEBS": "1",
-        "PublicDescription": "This event counts line-split store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
+        "PublicDescription": "This event counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
        "EventCode": "0xD0",
        "Counter": "0,1,2,3",
        "UMask": "0x42",
@ -179,7 +179,7 @@
        "CounterHTOff": "0,1,2,3"
    },
    {
-        "PublicDescription": "This event counts L1D data line replacements.  Replacements occur when a new line is brought into the cache, causing eviction of a line loaded earlier.  ",
+        "PublicDescription": "This event counts L1D data line replacements.  Replacements occur when a new line is brought into the cache, causing eviction of a line loaded earlier.",
        "EventCode": "0x51",
        "Counter": "0,1,2,3",
        "UMask": "0x1",
--- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
@ -1,140 +1,232 @@
 [
    {
-        "BriefDescription": "Instructions Per Cycle (per logical thread)",
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Frontend_Bound"
+    },
+    {
+        "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-ops (uops). Ideally the Frontend can issue 4 uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Frontend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Bad_Speculation"
+    },
+    {
+        "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Bad_Speculation_SMT"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * cycles)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * cycles)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Backend_Bound"
+    },
+    {
+        "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (( INT_MISC.RECOVERY_CYCLES_ANY / 2 )) ) / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) )",
+        "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Backend_Bound_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * cycles)",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. ",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "Retiring"
+    },
+    {
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))",
+        "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum 4 uops retired per cycle has been achieved.  Maximizing Retiring typically increases the Instruction-Per-Cycle metric. Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Microcode assists are categorized under Retiring. They hurt performance and can often be avoided. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
+        "MetricGroup": "TopdownL1_SMT",
+        "MetricName": "Retiring_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Instructions Per Cycle (per logical thread)",
        "MetricGroup": "TopDownL1",
        "MetricName": "IPC"
    },
    {
-        "BriefDescription": "Uops Per Instruction",
        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
-        "MetricGroup": "Pipeline",
+        "BriefDescription": "Uops Per Instruction",
+        "MetricGroup": "Pipeline;Retiring",
        "MetricName": "UPI"
    },
    {
-        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
-        "MetricGroup": "Frontend",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4 ) )",
+        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely (includes speculatively fetches) consumed by program instructions",
+        "MetricGroup": "PGO",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
-        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
-        "MetricGroup": "DSB; Frontend_Bandwidth",
+        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "MetricGroup": "DSB;Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
-        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
+        "BriefDescription": "Cycles Per Instruction (threaded)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
    {
-        "BriefDescription": "Per-thread actual clocks when the logical processor is active. This is called 'Clockticks' in VTune.",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+        "BriefDescription": "Per-thread actual clocks when the logical processor is active.",
        "MetricGroup": "Summary",
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "4 * cycles",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
    {
-        "BriefDescription": "Total number of retired Instructions",
+        "MetricExpr": "4 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Total issue-pipeline slots (per core)",
+        "MetricGroup": "TopDownL1_SMT",
+        "MetricName": "SLOTS_SMT"
+    },
+    {
        "MetricExpr": "INST_RETIRED.ANY",
+        "BriefDescription": "Total number of retired Instructions",
        "MetricGroup": "Summary",
        "MetricName": "Instructions"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / cycles",
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
+        "MetricExpr": "INST_RETIRED.ANY / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Instructions Per Cycle (per physical core)",
+        "MetricGroup": "SMT",
+        "MetricName": "CoreIPC_SMT"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / cycles",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS",
+        "MetricName": "FLOPc"
+    },
+    {
+        "MetricExpr": "(( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
+        "BriefDescription": "Floating Point Operations Per Cycle",
+        "MetricGroup": "FLOPS_SMT",
+        "MetricName": "FLOPc_SMT"
+    },
+    {
+        "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
+        "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
    {
-        "BriefDescription": "Average CPU Utilization",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "BriefDescription": "Average CPU Utilization",
        "MetricGroup": "Summary",
        "MetricName": "CPU_Utilization"
    },
    {
+        "MetricExpr": "( (( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE )) / 1000000000 ) / duration_time",
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },
    {
-        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
        "MetricGroup": "Power",
        "MetricName": "Turbo_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricExpr": "1 - CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+        "BriefDescription": "Fraction of cycles where both hardware threads were active",
        "MetricGroup": "SMT;Summary",
        "MetricName": "SMT_2T_Utilization"
    },
    {
-        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC:u / CPU_CLK_UNHALTED.REF_TSC",
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
        "MetricGroup": "Summary",
        "MetricName": "Kernel_Utilization"
    },
    {
-        "BriefDescription": "C3 residency percent per core",
+        "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+        "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+        "MetricGroup": "Memory_BW",
+        "MetricName": "DRAM_BW_Use"
+    },
+    {
+        "MetricExpr": "cbox_0@event\\=0x0@",
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricGroup": "",
+        "MetricName": "Socket_CLKS"
+    },
+    {
        "MetricExpr": "(cstate_core@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per core",
        "MetricName": "C3_Core_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per core",
        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per core",
        "MetricName": "C6_Core_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per core",
        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per core",
        "MetricName": "C7_Core_Residency"
    },
    {
-        "BriefDescription": "C2 residency percent per package",
        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C2 residency percent per package",
        "MetricName": "C2_Pkg_Residency"
    },
    {
-        "BriefDescription": "C3 residency percent per package",
        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C3 residency percent per package",
        "MetricName": "C3_Pkg_Residency"
    },
    {
-        "BriefDescription": "C6 residency percent per package",
        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C6 residency percent per package",
        "MetricName": "C6_Pkg_Residency"
    },
    {
-        "BriefDescription": "C7 residency percent per package",
        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
        "MetricGroup": "Power",
+        "BriefDescription": "C7 residency percent per package",
        "MetricName": "C7_Pkg_Residency"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/jaketown/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/pipeline.json
@ -1,7 +1,6 @@
 [
    {
-        "PublicDescription": "This event counts the number of instructions retired from execution. For instructions that consist of multiple micro-ops, this event counts the retirement of the last micro-op of the instruction. Counting continues during hardware interrupts, traps, and inside interrupt handlers. ",
-        "EventCode": "0x00",
+        "PublicDescription": "This event counts the number of instructions retired from execution. For instructions that consist of multiple micro-ops, this event counts the retirement of the last micro-op of the instruction. Counting continues during hardware interrupts, traps, and inside interrupt handlers.",
        "Counter": "Fixed counter 1",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -10,8 +9,7 @@
        "CounterHTOff": "Fixed counter 1"
    },
    {
-        "PublicDescription": "This event counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. ",
-        "EventCode": "0x00",
+        "PublicDescription": "This event counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events.",
        "Counter": "Fixed counter 2",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -20,8 +18,7 @@
        "CounterHTOff": "Fixed counter 2"
    },
    {
-        "PublicDescription": "This event counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. This event has a constant ratio with the CPU_CLK_UNHALTED.REF_XCLK event. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. ",
-        "EventCode": "0x00",
+        "PublicDescription": "This event counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. This event has a constant ratio with the CPU_CLK_UNHALTED.REF_XCLK event. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events.",
        "Counter": "Fixed counter 3",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
@ -778,7 +775,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load.  The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceding smaller uncompleted store.  See the table of not supported store forwards in the Intel? 64 and IA-32 Architectures Optimization Reference Manual.  The penalty for blocked store forwarding is that the load must wait for the store to complete before it can be issued.",
+        "PublicDescription": "This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load.  The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceeding smaller uncompleted store.  See the table of not supported store forwards in the Intel? 64 and IA-32 Architectures Optimization Reference Manual.  The penalty for blocked store forwarding is that the load must wait for the store to complete before it can be issued.",
        "EventCode": "0x03",
        "Counter": "0,1,2,3",
        "UMask": "0x2",
@ -1098,7 +1095,6 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 2",
        "UMask": "0x2",
        "AnyThread": "1",
--- a/tools/perf/pmu-events/arch/x86/knightslanding/cache.json
+++ b/tools/perf/pmu-events/arch/x86/knightslanding/cache.json
--- a/tools/perf/pmu-events/arch/x86/knightslanding/memory.json
+++ b/tools/perf/pmu-events/arch/x86/knightslanding/memory.json
@ -9,18 +9,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400070 ",
+        "MSRValue": "0x0100400070",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_PF_L2.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any Prefetch requests that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts any Prefetch requests that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200070 ",
+        "MSRValue": "0x0080200070",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_PF_L2.MCDRAM_NEAR",
@ -31,18 +31,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000070 ",
+        "MSRValue": "0x0101000070",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_PF_L2.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any Prefetch requests that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts any Prefetch requests that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800070 ",
+        "MSRValue": "0x0080800070",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_PF_L2.DDR_NEAR",
@ -53,18 +53,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x01004032f7 ",
+        "MSRValue": "0x01004032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any Read request  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts any Read request  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x00802032f7 ",
+        "MSRValue": "0x00802032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.MCDRAM_NEAR",
@ -75,18 +75,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x01010032f7 ",
+        "MSRValue": "0x01010032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any Read request  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts any Read request  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x00808032f7 ",
+        "MSRValue": "0x00808032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.DDR_NEAR",
@ -97,18 +97,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400044 ",
+        "MSRValue": "0x0100400044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand code reads and prefetch code read requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Demand code reads and prefetch code read requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200044 ",
+        "MSRValue": "0x0080200044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.MCDRAM_NEAR",
@ -119,18 +119,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000044 ",
+        "MSRValue": "0x0101000044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand code reads and prefetch code read requests  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Demand code reads and prefetch code read requests  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800044 ",
+        "MSRValue": "0x0080800044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.DDR_NEAR",
@ -141,18 +141,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400022 ",
+        "MSRValue": "0x0100400022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data write requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Demand cacheable data write requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200022 ",
+        "MSRValue": "0x0080200022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.MCDRAM_NEAR",
@ -163,18 +163,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000022 ",
+        "MSRValue": "0x0101000022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data write requests  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Demand cacheable data write requests  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800022 ",
+        "MSRValue": "0x0080800022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.DDR_NEAR",
@ -185,18 +185,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100403091 ",
+        "MSRValue": "0x0100403091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data and L1 prefetch data read requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Demand cacheable data and L1 prefetch data read requests  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080203091 ",
+        "MSRValue": "0x0080203091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.MCDRAM_NEAR",
@ -207,18 +207,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101003091 ",
+        "MSRValue": "0x0101003091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data and L1 prefetch data read requests  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Demand cacheable data and L1 prefetch data read requests  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080803091 ",
+        "MSRValue": "0x0080803091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.DDR_NEAR",
@ -229,18 +229,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100408000 ",
+        "MSRValue": "0x0100408000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any request that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts any request that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080208000 ",
+        "MSRValue": "0x0080208000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.MCDRAM_NEAR",
@ -251,18 +251,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101008000 ",
+        "MSRValue": "0x0101008000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts any request that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts any request that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080808000 ",
+        "MSRValue": "0x0080808000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.DDR_NEAR",
@ -273,18 +273,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100402000 ",
+        "MSRValue": "0x0100402000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L1 data HW prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts L1 data HW prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080202000 ",
+        "MSRValue": "0x0080202000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.MCDRAM_NEAR",
@ -295,18 +295,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101002000 ",
+        "MSRValue": "0x0101002000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L1 data HW prefetches that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts L1 data HW prefetches that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080802000 ",
+        "MSRValue": "0x0080802000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.DDR_NEAR",
@ -317,18 +317,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100401000 ",
+        "MSRValue": "0x0100401000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Software Prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Software Prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080201000 ",
+        "MSRValue": "0x0080201000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.MCDRAM_NEAR",
@ -339,18 +339,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101001000 ",
+        "MSRValue": "0x0101001000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Software Prefetches that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Software Prefetches that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080801000 ",
+        "MSRValue": "0x0080801000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.DDR_NEAR",
@ -361,18 +361,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400400 ",
+        "MSRValue": "0x0100400400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Bus locks and split lock requests that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Bus locks and split lock requests that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200400 ",
+        "MSRValue": "0x0080200400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.MCDRAM_NEAR",
@ -383,18 +383,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000400 ",
+        "MSRValue": "0x0101000400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Bus locks and split lock requests that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Bus locks and split lock requests that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800400 ",
+        "MSRValue": "0x0080800400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.DDR_NEAR",
@ -405,18 +405,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400200 ",
+        "MSRValue": "0x0100400200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts UC code reads (valid only for Outstanding response type)  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts UC code reads (valid only for Outstanding response type)  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200200 ",
+        "MSRValue": "0x0080200200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.MCDRAM_NEAR",
@ -427,18 +427,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000200 ",
+        "MSRValue": "0x0101000200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts UC code reads (valid only for Outstanding response type)  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts UC code reads (valid only for Outstanding response type)  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800200 ",
+        "MSRValue": "0x0080800200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.DDR_NEAR",
@ -449,18 +449,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400100 ",
+        "MSRValue": "0x0100400100",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.MCDRAM_FAR",
        "MSRIndex": "0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Partial writes (UC or WT or WP and should be programmed on PMC1) that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Partial writes (UC or WT or WP and should be programmed on PMC1) that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200100 ",
+        "MSRValue": "0x0080200100",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.MCDRAM_NEAR",
@ -471,18 +471,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000100 ",
+        "MSRValue": "0x0101000100",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.DDR_FAR",
        "MSRIndex": "0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Partial writes (UC or WT or WP and should be programmed on PMC1) that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Partial writes (UC or WT or WP and should be programmed on PMC1) that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800100 ",
+        "MSRValue": "0x0080800100",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.DDR_NEAR",
@ -493,7 +493,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x2000020080 ",
+        "MSRValue": "0x2000020080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.NON_DRAM",
@ -504,18 +504,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400080 ",
+        "MSRValue": "0x0100400080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Partial reads (UC or WC and is valid only for Outstanding response type).  that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Partial reads (UC or WC and is valid only for Outstanding response type).  that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200080 ",
+        "MSRValue": "0x0080200080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.MCDRAM_NEAR",
@ -526,18 +526,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000080 ",
+        "MSRValue": "0x0101000080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Partial reads (UC or WC and is valid only for Outstanding response type).  that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Partial reads (UC or WC and is valid only for Outstanding response type).  that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800080 ",
+        "MSRValue": "0x0080800080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.DDR_NEAR",
@ -548,18 +548,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400040 ",
+        "MSRValue": "0x0100400040",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L2 code HW prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts L2 code HW prefetches that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200040 ",
+        "MSRValue": "0x0080200040",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.MCDRAM_NEAR",
@ -570,18 +570,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000040 ",
+        "MSRValue": "0x0101000040",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L2 code HW prefetches that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts L2 code HW prefetches that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800040 ",
+        "MSRValue": "0x0080800040",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.DDR_NEAR",
@ -592,7 +592,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x2000020020 ",
+        "MSRValue": "0x2000020020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.NON_DRAM",
@ -603,18 +603,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400020 ",
+        "MSRValue": "0x0100400020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L2 data RFO prefetches (includes PREFETCHW instruction) that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts L2 data RFO prefetches (includes PREFETCHW instruction) that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200020 ",
+        "MSRValue": "0x0080200020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.MCDRAM_NEAR",
@ -625,18 +625,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000020 ",
+        "MSRValue": "0x0101000020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts L2 data RFO prefetches (includes PREFETCHW instruction) that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts L2 data RFO prefetches (includes PREFETCHW instruction) that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800020 ",
+        "MSRValue": "0x0080800020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.DDR_NEAR",
@ -647,18 +647,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400004 ",
+        "MSRValue": "0x0100400004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand code reads and prefetch code reads that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts demand code reads and prefetch code reads that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200004 ",
+        "MSRValue": "0x0080200004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.MCDRAM_NEAR",
@ -669,18 +669,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000004 ",
+        "MSRValue": "0x0101000004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand code reads and prefetch code reads that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts demand code reads and prefetch code reads that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800004 ",
+        "MSRValue": "0x0080800004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.DDR_NEAR",
@ -691,18 +691,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400002 ",
+        "MSRValue": "0x0100400002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data writes that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts Demand cacheable data writes that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200002 ",
+        "MSRValue": "0x0080200002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.MCDRAM_NEAR",
@ -713,18 +713,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000002 ",
+        "MSRValue": "0x0101000002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts Demand cacheable data writes that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts Demand cacheable data writes that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800002 ",
+        "MSRValue": "0x0080800002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.DDR_NEAR",
@ -735,18 +735,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0100400001 ",
+        "MSRValue": "0x0100400001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.MCDRAM_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand cacheable data and L1 prefetch data reads that accounts for data responses from MCDRAM Far or Other tile L2 hit far. ",
+        "BriefDescription": "Counts demand cacheable data and L1 prefetch data reads that accounts for data responses from MCDRAM Far or Other tile L2 hit far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080200001 ",
+        "MSRValue": "0x0080200001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.MCDRAM_NEAR",
@ -757,18 +757,18 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0101000001 ",
+        "MSRValue": "0x0101000001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.DDR_FAR",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100007",
-        "BriefDescription": "Counts demand cacheable data and L1 prefetch data reads that accounts for data responses from DRAM Far. ",
+        "BriefDescription": "Counts demand cacheable data and L1 prefetch data reads that accounts for data responses from DRAM Far.",
        "Offcore": "1"
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0080800001 ",
+        "MSRValue": "0x0080800001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.DDR_NEAR",
@ -779,7 +779,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600001 ",
+        "MSRValue": "0x0180600001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.MCDRAM",
@ -790,7 +790,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600002 ",
+        "MSRValue": "0x0180600002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.MCDRAM",
@ -801,7 +801,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600004 ",
+        "MSRValue": "0x0180600004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.MCDRAM",
@ -812,7 +812,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600020 ",
+        "MSRValue": "0x0180600020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.MCDRAM",
@ -823,7 +823,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600080 ",
+        "MSRValue": "0x0180600080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.MCDRAM",
@ -834,7 +834,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600100 ",
+        "MSRValue": "0x0180600100",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_WRITES.MCDRAM",
@ -845,7 +845,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600200 ",
+        "MSRValue": "0x0180600200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.MCDRAM",
@ -856,7 +856,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600400 ",
+        "MSRValue": "0x0180600400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.MCDRAM",
@ -867,7 +867,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180601000 ",
+        "MSRValue": "0x0180601000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.MCDRAM",
@ -878,7 +878,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180608000 ",
+        "MSRValue": "0x0180608000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.MCDRAM",
@ -889,7 +889,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180603091 ",
+        "MSRValue": "0x0180603091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.MCDRAM",
@ -900,7 +900,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600022 ",
+        "MSRValue": "0x0180600022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.MCDRAM",
@ -911,7 +911,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600044 ",
+        "MSRValue": "0x0180600044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.MCDRAM",
@ -922,7 +922,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x01806032f7 ",
+        "MSRValue": "0x01806032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.MCDRAM",
@ -933,7 +933,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0180600070 ",
+        "MSRValue": "0x0180600070",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_PF_L2.MCDRAM",
@ -944,7 +944,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800001 ",
+        "MSRValue": "0x0181800001",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.DDR",
@ -955,7 +955,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800002 ",
+        "MSRValue": "0x0181800002",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.DDR",
@ -966,7 +966,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800004 ",
+        "MSRValue": "0x0181800004",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.DDR",
@ -977,7 +977,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800020 ",
+        "MSRValue": "0x0181800020",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.DDR",
@ -988,7 +988,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800040 ",
+        "MSRValue": "0x0181800040",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L2_CODE_RD.DDR",
@ -999,7 +999,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800080 ",
+        "MSRValue": "0x0181800080",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PARTIAL_READS.DDR",
@ -1010,7 +1010,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800200 ",
+        "MSRValue": "0x0181800200",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.UC_CODE_READS.DDR",
@ -1021,7 +1021,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800400 ",
+        "MSRValue": "0x0181800400",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.BUS_LOCKS.DDR",
@ -1032,7 +1032,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181801000 ",
+        "MSRValue": "0x0181801000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_SOFTWARE.DDR",
@ -1043,7 +1043,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181802000 ",
+        "MSRValue": "0x0181802000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.PF_L1_DATA_RD.DDR",
@ -1054,7 +1054,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181808000 ",
+        "MSRValue": "0x0181808000",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_REQUEST.DDR",
@ -1065,7 +1065,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181803091 ",
+        "MSRValue": "0x0181803091",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_DATA_RD.DDR",
@ -1076,7 +1076,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800022 ",
+        "MSRValue": "0x0181800022",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_RFO.DDR",
@ -1087,7 +1087,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x0181800044 ",
+        "MSRValue": "0x0181800044",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_CODE_RD.DDR",
@ -1098,7 +1098,7 @@
    },
    {
        "EventCode": "0xB7",
-        "MSRValue": "0x01818032f7 ",
+        "MSRValue": "0x01818032f7",
        "Counter": "0,1",
        "UMask": "0x1",
        "EventName": "OFFCORE_RESPONSE.ANY_READ.DDR",
--- a/tools/perf/pmu-events/arch/x86/knightslanding/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/knightslanding/pipeline.json
@ -144,7 +144,7 @@
        "BriefDescription": "Counts the number of micro-ops retired that are from the complex flows issued by the micro-sequencer (MS)."
    },
    {
-        "PublicDescription": "This event counts the number of micro-ops (uops) retired. The processor decodes complex macro instructions into a sequence of simpler uops. Most instructions are composed of one or two uops. Some instructions are decoded into longer sequences such as repeat instructions, floating point transcendental instructions, and assists. ",
+        "PublicDescription": "This event counts the number of micro-ops (uops) retired. The processor decodes complex macro instructions into a sequence of simpler uops. Most instructions are composed of one or two uops. Some instructions are decoded into longer sequences such as repeat instructions, floating point transcendental instructions, and assists.",
        "EventCode": "0xC2",
        "Counter": "0,1",
        "UMask": "0x10",
@ -218,7 +218,7 @@
        "UMask": "0x20",
        "EventName": "NO_ALLOC_CYCLES.RAT_STALL",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted.  "
+        "BriefDescription": "Counts the number of core cycles when no micro-ops are allocated and a RATstall (caused by reservation station full) is asserted."
    },
    {
        "PublicDescription": "This event counts the number of core cycles when no uops are allocated, the instruction queue is empty and the alloc pipe is stalled waiting for instructions to be fetched.",
@ -251,7 +251,7 @@
        "UMask": "0x1f",
        "EventName": "RS_FULL_STALL.ALL",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full. "
+        "BriefDescription": "Counts the total number of core cycles the Alloc pipeline is stalled when any one of the reservation stations is full."
    },
    {
        "EventCode": "0xC0",
@ -268,11 +268,10 @@
        "UMask": "0x1",
        "EventName": "CYCLES_DIV_BUSY.ALL",
        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles the number of core cycles when divider is busy.  Does not imply a stall waiting for the divider.  "
+        "BriefDescription": "Cycles the number of core cycles when divider is busy.  Does not imply a stall waiting for the divider."
    },
    {
        "PublicDescription": "This event counts the number of instructions that retire.  For instructions that consist of multiple micro-ops, this event counts exactly once, as the last micro-op of the instruction retires.  The event continues counting while instructions retire, including during interrupt service routines caused by hardware interrupts, faults or traps.",
-        "EventCode": "0x00",
        "Counter": "Fixed counter 1",
        "UMask": "0x1",
        "EventName": "INST_RETIRED.ANY",
@ -296,8 +295,7 @@
        "BriefDescription": "Counts the number of unhalted reference clock cycles"
    },
    {
-        "PublicDescription": "This event counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter\r\n",
-        "EventCode": "0x00",
+        "PublicDescription": "This event counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter",
        "Counter": "Fixed counter 2",
        "UMask": "0x2",
        "EventName": "CPU_CLK_UNHALTED.THREAD",
@ -305,7 +303,6 @@
        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles"
    },
    {
-        "EventCode": "0x00",
        "Counter": "Fixed counter 3",
        "UMask": "0x3",
        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
@ -343,7 +340,7 @@
        "UMask": "0x1",
        "EventName": "RECYCLEQ.LD_BLOCK_ST_FORWARD",
        "SampleAfterValue": "200003",
-        "BriefDescription": "Counts the number of occurences a retired load gets blocked because its address partially overlaps with a store ",
+        "BriefDescription": "Counts the number of occurences a retired load gets blocked because its address partially overlaps with a store",
        "Data_LA": "1"
    },
    {
--- a/tools/perf/pmu-events/arch/x86/knightslanding/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/knightslanding/virtual-memory.json
@ -36,7 +36,7 @@
        "EdgeDetect": "1"
    },
    {
-        "PublicDescription": "This event counts every cycle when an I-side (walks due to an instruction fetch) page walk is in progress. ",
+        "PublicDescription": "This event counts every cycle when an I-side (walks due to an instruction fetch) page walk is in progress.",
        "EventCode": "0x05",
        "Counter": "0,1",
        "UMask": "0x2",
--- a/tools/perf/pmu-events/arch/x86/sandybridge/cache.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/cache.json
@ -1,329 +1,4 @@
 [
-    {
-        "PEBS": "1",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x11",
-        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_LOADS",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired load uops that miss the STLB.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x12",
-        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_STORES",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired store uops that miss the STLB.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x21",
-        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Retired load uops with locked access.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts line-split load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x41",
-        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired load uops that split across a cacheline boundary.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts line-split store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K).",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x42",
-        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired store uops that split across a cacheline boundary.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts the number of load uops retired",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x81",
-        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "All retired load uops.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts the number of store uops retired.",
-        "EventCode": "0xD0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x82",
-        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "All retired store uops.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Retired load uops with L1 cache hits as data sources.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired load uops with L2 cache hits as data sources.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts retired load uops that hit in the last-level (L3) cache without snoops required.",
-        "EventCode": "0xD1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.LLC_HIT",
-        "SampleAfterValue": "50021",
-        "BriefDescription": "Retired load uops which data sources were data hits in LLC without snoops required.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x40",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.HIT_LFB",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD2",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS",
-        "SampleAfterValue": "20011",
-        "BriefDescription": "Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts retired load uops that hit in the last-level cache (L3) and were found in a non-modified state in a neighboring core's private cache (same package).  Since the last level cache is inclusive, hits to the L3 may require snooping the private L2 caches of any cores on the same socket that have the line.  In this case, a snoop was required, and another L2 had the line in a non-modified state.",
-        "EventCode": "0xD2",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT",
-        "SampleAfterValue": "20011",
-        "BriefDescription": "Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts retired load uops that hit in the last-level cache (L3) and were found in a non-modified state in a neighboring core's private cache (same package).  Since the last level cache is inclusive, hits to the L3 may require snooping the private L2 caches of any cores on the same socket that have the line.  In this case, a snoop was required, and another L2 had the line in a modified state, so the line had to be invalidated in that L2 cache and transferred to the requesting L2.",
-        "EventCode": "0xD2",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM",
-        "SampleAfterValue": "20011",
-        "BriefDescription": "Retired load uops which data sources were HitM responses from shared LLC.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "EventCode": "0xD2",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Retired load uops which data sources were hits in LLC without snoops required.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PEBS": "1",
-        "PublicDescription": "This event counts retired demand loads that missed the  last-level (L3) cache. This means that the load is usually satisfied from memory in a client system or possibly from the remote socket in a server. Demand loads are non speculative load uops.",
-        "EventCode": "0xD4",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS",
-        "SampleAfterValue": "100007",
-        "BriefDescription": "Retired load uops with unknown information as data source in cache serviced the load.",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "PublicDescription": "This event counts L1D data line replacements.  Replacements occur when a new line is brought into the cache, causing eviction of a line loaded earlier.  ",
-        "EventCode": "0x51",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "L1D.REPLACEMENT",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "L1D data line replacements.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x51",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "L1D.ALLOCATED_IN_M",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Allocated L1D data cache lines in M state.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x51",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "L1D.EVICTION",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "L1D data cache lines in M state evicted due to replacement.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x51",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "L1D.ALL_M_REPLACEMENT",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x48",
-        "Counter": "2",
-        "UMask": "0x1",
-        "EventName": "L1D_PEND_MISS.PENDING",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "L1D miss oustandings duration in cycles.",
-        "CounterHTOff": "2"
-    },
-    {
-        "EventCode": "0x48",
-        "Counter": "2",
-        "UMask": "0x1",
-        "EventName": "L1D_PEND_MISS.PENDING_CYCLES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with L1D load Misses outstanding.",
-        "CounterMask": "1",
-        "CounterHTOff": "2"
-    },
-    {
-        "EventCode": "0x63",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "LOCK_CYCLES.CACHE_LOCK_DURATION",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when L1D is locked.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Offcore outstanding Demand Data Read transactions in uncore queue.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xB0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Demand Data Read requests sent to uncore.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xB0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Cacheable and noncachaeble code read requests.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xB0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "OFFCORE_REQUESTS.DEMAND_RFO",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Demand RFO requests including regular RFOs, locks, ItoM.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xB0",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "OFFCORE_REQUESTS.ALL_DATA_RD",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Demand and prefetch data reads.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xB2",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_REQUESTS_BUFFER.SQ_FULL",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cases when offcore requests buffer cannot take more entries for core.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
@ -333,6 +8,15 @@
        "BriefDescription": "Demand Data Read requests that hit L2 cache.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x24",
+        "Counter": "0,1,2,3",
+        "UMask": "0x3",
+        "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Demand Data Read requests.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
@ -351,6 +35,15 @@
        "BriefDescription": "RFO requests that miss L2 cache.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x24",
+        "Counter": "0,1,2,3",
+        "UMask": "0xc",
+        "EventName": "L2_RQSTS.ALL_RFO",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "RFO requests to L2 cache.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
@ -369,6 +62,15 @@
        "BriefDescription": "L2 cache misses when fetching instructions.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x24",
+        "Counter": "0,1,2,3",
+        "UMask": "0x30",
+        "EventName": "L2_RQSTS.ALL_CODE_RD",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "L2 code requests.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x24",
        "Counter": "0,1,2,3",
@ -387,6 +89,15 @@
        "BriefDescription": "Requests from the L2 hardware prefetchers that miss L2 cache.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x24",
+        "Counter": "0,1,2,3",
+        "UMask": "0xc0",
+        "EventName": "L2_RQSTS.ALL_PF",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Requests from L2 hardware prefetchers.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x27",
        "Counter": "0,1,2,3",
@ -468,6 +179,400 @@
        "BriefDescription": "Not rejected writebacks from L1D to L2 cache lines in any state.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x2E",
+        "Counter": "0,1,2,3",
+        "UMask": "0x41",
+        "EventName": "LONGEST_LAT_CACHE.MISS",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Core-originated cacheable demand requests missed LLC.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x2E",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4f",
+        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Core-originated cacheable demand requests that refer to LLC.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x48",
+        "Counter": "2",
+        "UMask": "0x1",
+        "EventName": "L1D_PEND_MISS.PENDING",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "L1D miss oustandings duration in cycles.",
+        "CounterHTOff": "2"
+    },
+    {
+        "EventCode": "0x48",
+        "Counter": "2",
+        "UMask": "0x1",
+        "EventName": "L1D_PEND_MISS.PENDING_CYCLES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles with L1D load Misses outstanding.",
+        "CounterMask": "1",
+        "CounterHTOff": "2"
+    },
+    {
+        "EventCode": "0x48",
+        "Counter": "2",
+        "UMask": "0x1",
+        "AnyThread": "1",
+        "EventName": "L1D_PEND_MISS.PENDING_CYCLES_ANY",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles with L1D load Misses outstanding from any thread on physical core.",
+        "CounterMask": "1",
+        "CounterHTOff": "2"
+    },
+    {
+        "EventCode": "0x48",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "L1D_PEND_MISS.FB_FULL",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles a demand request was blocked due to Fill Buffers inavailability.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "PublicDescription": "This event counts L1D data line replacements.  Replacements occur when a new line is brought into the cache, causing eviction of a line loaded earlier.",
+        "EventCode": "0x51",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "L1D.REPLACEMENT",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "L1D data line replacements.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x51",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "L1D.ALLOCATED_IN_M",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Allocated L1D data cache lines in M state.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x51",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "L1D.EVICTION",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "L1D data cache lines in M state evicted due to replacement.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x51",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "L1D.ALL_M_REPLACEMENT",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Offcore outstanding Demand Data Read transactions in uncore queue.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_C6",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue.",
+        "CounterMask": "6",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Offcore outstanding RFO store transactions in SuperQueue (SQ), queue to uncore.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x63",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "LOCK_CYCLES.CACHE_LOCK_DURATION",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when L1D is locked.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Demand Data Read requests sent to uncore.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Cacheable and noncachaeble code read requests.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_RFO",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Demand RFO requests including regular RFOs, locks, ItoM.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "OFFCORE_REQUESTS.ALL_DATA_RD",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Demand and prefetch data reads.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB2",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "OFFCORE_REQUESTS_BUFFER.SQ_FULL",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cases when offcore requests buffer cannot take more entries for core.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xBF",
+        "Counter": "0,1,2,3",
+        "UMask": "0x5",
+        "EventName": "L1D_BLOCKS.BANK_CONFLICT_CYCLES",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Cycles when dispatched loads are cancelled due to L1D bank conflicts with other load ports.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x11",
+        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_LOADS",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired load uops that miss the STLB. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x12",
+        "EventName": "MEM_UOPS_RETIRED.STLB_MISS_STORES",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired store uops that miss the STLB. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x21",
+        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
+        "SampleAfterValue": "100007",
+        "BriefDescription": "Retired load uops with locked access. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts line-splitted load uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). (Precise Event - PEBS)",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x41",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired load uops that split across a cacheline boundary. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts line-splitted store uops retired to the architected path. A line split is across 64B cache-line which includes a page split (4K). (Precise Event - PEBS)",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x42",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired store uops that split across a cacheline boundary. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts the number of load uops retired (Precise Event)",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x81",
+        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "All retired load uops. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts the number of store uops retired. (Precise Event - PEBS)",
+        "EventCode": "0xD0",
+        "Counter": "0,1,2,3",
+        "UMask": "0x82",
+        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "All retired store uops. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Retired load uops with L1 cache hits as data sources. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired load uops with L2 cache hits as data sources. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts retired load uops that hit in the last-level (L3) cache without snoops required. (Precise Event - PEBS)",
+        "EventCode": "0xD1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.LLC_HIT",
+        "SampleAfterValue": "50021",
+        "BriefDescription": "Retired load uops which data sources were data hits in LLC without snoops required. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x40",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.HIT_LFB",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired load uops which data sources were load uops missed L1 but hit FB due to preceding miss to the same cache line with data not ready. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD2",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS",
+        "SampleAfterValue": "20011",
+        "BriefDescription": "Retired load uops which data sources were LLC hit and cross-core snoop missed in on-pkg core cache. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts retired load uops that hit in the last-level cache (L3) and were found in a non-modified state in a neighboring core's private cache (same package).  Since the last level cache is inclusive, hits to the L3 may require snooping the private L2 caches of any cores on the same socket that have the line.  In this case, a snoop was required, and another L2 had the line in a non-modified state. (Precise Event - PEBS)",
+        "EventCode": "0xD2",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT",
+        "SampleAfterValue": "20011",
+        "BriefDescription": "Retired load uops which data sources were LLC and cross-core snoop hits in on-pkg core cache. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts retired load uops that hit in the last-level cache (L3) and were found in a non-modified state in a neighboring core's private cache (same package).  Since the last level cache is inclusive, hits to the L3 may require snooping the private L2 caches of any cores on the same socket that have the line.  In this case, a snoop was required, and another L2 had the line in a modified state, so the line had to be invalidated in that L2 cache and transferred to the requesting L2. (Precise Event - PEBS)",
+        "EventCode": "0xD2",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM",
+        "SampleAfterValue": "20011",
+        "BriefDescription": "Retired load uops which data sources were HitM responses from shared LLC. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "EventCode": "0xD2",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Retired load uops which data sources were hits in LLC without snoops required. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "PEBS": "1",
+        "PublicDescription": "This event counts retired demand loads that missed the  last-level (L3) cache. This means that the load is usually satisfied from memory in a client system or possibly from the remote socket in a server. Demand loads are non speculative load uops. (Precise Event - PEBS)",
+        "EventCode": "0xD4",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS",
+        "SampleAfterValue": "100007",
+        "BriefDescription": "Retired load uops with unknown information as data source in cache serviced the load. (Precise Event - PEBS).",
+        "CounterHTOff": "0,1,2,3"
+    },
    {
        "EventCode": "0xF0",
        "Counter": "0,1,2,3",
@ -622,24 +727,6 @@
        "BriefDescription": "Dirty L2 cache lines filling the L2.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
-    {
-        "EventCode": "0x2E",
-        "Counter": "0,1,2,3",
-        "UMask": "0x41",
-        "EventName": "LONGEST_LAT_CACHE.MISS",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Core-originated cacheable demand requests missed LLC.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x2E",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4f",
-        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Core-originated cacheable demand requests that refer to LLC.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0xF4",
        "Counter": "0,1,2,3",
@ -649,93 +736,6 @@
        "BriefDescription": "Split locks in SQ.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
-    {
-        "EventCode": "0x24",
-        "Counter": "0,1,2,3",
-        "UMask": "0x3",
-        "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD",
-        "SampleAfterValue": "200003",
-        "BriefDescription": "Demand Data Read requests.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x24",
-        "Counter": "0,1,2,3",
-        "UMask": "0xc",
-        "EventName": "L2_RQSTS.ALL_RFO",
-        "SampleAfterValue": "200003",
-        "BriefDescription": "RFO requests to L2 cache.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x24",
-        "Counter": "0,1,2,3",
-        "UMask": "0x30",
-        "EventName": "L2_RQSTS.ALL_CODE_RD",
-        "SampleAfterValue": "200003",
-        "BriefDescription": "L2 code requests.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x24",
-        "Counter": "0,1,2,3",
-        "UMask": "0xc0",
-        "EventName": "L2_RQSTS.ALL_PF",
-        "SampleAfterValue": "200003",
-        "BriefDescription": "Requests from L2 hardware prefetchers.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xBF",
-        "Counter": "0,1,2,3",
-        "UMask": "0x5",
-        "EventName": "L1D_BLOCKS.BANK_CONFLICT_CYCLES",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Cycles when dispatched loads are cancelled due to L1D bank conflicts with other load ports.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x60",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_C6",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue.",
-        "CounterMask": "6",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x48",
-        "Counter": "2",
-        "UMask": "0x1",
-        "AnyThread": "1",
-        "EventName": "L1D_PEND_MISS.PENDING_CYCLES_ANY",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with L1D load Misses outstanding from any thread on physical core.",
-        "CounterMask": "1",
-        "CounterHTOff": "2"
-    },
-    {
-        "EventCode": "0x48",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "L1D_PEND_MISS.FB_FULL",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles a demand request was blocked due to Fill Buffers inavailability.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0xB7, 0xBB",
        "MSRValue": "0x10003c0244",
@ -1825,7 +1825,7 @@
        "EventName": "OFFCORE_RESPONSE.DATA_IN.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": " REQUEST = DATA_INTO_CORE and RESPONSE = ANY_RESPONSE",
+        "BriefDescription": "REQUEST = DATA_INTO_CORE and RESPONSE = ANY_RESPONSE",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1837,7 +1837,7 @@
        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT_M.HITM",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": " REQUEST = DEMAND_RFO and RESPONSE = LLC_HIT_M and SNOOP = HITM",
+        "BriefDescription": "REQUEST = DEMAND_RFO and RESPONSE = LLC_HIT_M and SNOOP = HITM",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1849,7 +1849,7 @@
        "EventName": "OFFCORE_RESPONSE.PF_IFETCH.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": " REQUEST = PF_RFO and RESPONSE = ANY_RESPONSE",
+        "BriefDescription": "REQUEST = PF_RFO and RESPONSE = ANY_RESPONSE",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1861,7 +1861,7 @@
        "EventName": "OFFCORE_RESPONSE.PF_L_DATA_RD.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": " REQUEST = PF_LLC_DATA_RD and RESPONSE = ANY_RESPONSE",
+        "BriefDescription": "REQUEST = PF_LLC_DATA_RD and RESPONSE = ANY_RESPONSE",
        "CounterHTOff": "0,1,2,3"
    },
    {
@ -1873,7 +1873,7 @@
        "EventName": "OFFCORE_RESPONSE.PF_L_IFETCH.ANY_RESPONSE",
        "MSRIndex": "0x1a6,0x1a7",
        "SampleAfterValue": "100003",
-        "BriefDescription": " REQUEST = PF_LLC_IFETCH and RESPONSE = ANY_RESPONSE",
+        "BriefDescription": "REQUEST = PF_LLC_IFETCH and RESPONSE = ANY_RESPONSE",
        "CounterHTOff": "0,1,2,3"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/floating-point.json
@ -1,67 +1,4 @@
 [
-    {
-        "EventCode": "0xC1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "OTHER_ASSISTS.AVX_STORE",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of GSSE memory assist for stores. GSSE microcode assist is being invoked whenever the hardware is unable to properly handle GSSE-256b operations.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xC1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x10",
-        "EventName": "OTHER_ASSISTS.AVX_TO_SSE",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xC1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x20",
-        "EventName": "OTHER_ASSISTS.SSE_TO_AVX",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of transitions from SSE to AVX-256 when penalty applicable.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xCA",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "FP_ASSIST.X87_OUTPUT",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to output value.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xCA",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "FP_ASSIST.X87_INPUT",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of X87 assists due to input value.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xCA",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "FP_ASSIST.SIMD_OUTPUT",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to Output values.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0xCA",
-        "Counter": "0,1,2,3",
-        "UMask": "0x10",
-        "EventName": "FP_ASSIST.SIMD_INPUT",
-        "SampleAfterValue": "100003",
-        "BriefDescription": "Number of SIMD FP assists due to input values.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0x10",
        "Counter": "0,1,2,3",
@ -125,6 +62,69 @@
        "BriefDescription": "Number of AVX-256 Computational FP double precision uops issued this cycle.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0xC1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "OTHER_ASSISTS.AVX_STORE",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of GSSE memory assist for stores. GSSE microcode assist is being invoked whenever the hardware is unable to properly handle GSSE-256b operations.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x10",
+        "EventName": "OTHER_ASSISTS.AVX_TO_SSE",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of transitions from AVX-256 to legacy SSE when penalty applicable.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x20",
+        "EventName": "OTHER_ASSISTS.SSE_TO_AVX",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of transitions from SSE to AVX-256 when penalty applicable.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "FP_ASSIST.X87_OUTPUT",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of X87 assists due to output value.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "FP_ASSIST.X87_INPUT",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of X87 assists due to input value.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "FP_ASSIST.SIMD_OUTPUT",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of SIMD FP assists due to Output values.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x10",
+        "EventName": "FP_ASSIST.SIMD_INPUT",
+        "SampleAfterValue": "100003",
+        "BriefDescription": "Number of SIMD FP assists due to input values.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0xCA",
        "Counter": "0,1,2,3",
--- a/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
@ -1,23 +1,4 @@
 [
-    {
-        "EventCode": "0x80",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "ICACHE.HIT",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of Instruction Cache, Streaming Buffer and Victim Cache Reads. both cacheable and noncacheable, including UC fetches.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "PublicDescription": "This event counts the number of instruction cache, streaming buffer and victim cache misses. Counting includes unchacheable accesses.",
-        "EventCode": "0x80",
-        "Counter": "0,1,2,3",
-        "UMask": "0x2",
-        "EventName": "ICACHE.MISSES",
-        "SampleAfterValue": "200003",
-        "BriefDescription": "Instruction cache, streaming buffer and victim cache misses.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
@ -36,6 +17,16 @@
        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "EventName": "IDQ.MITE_CYCLES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
@ -45,6 +36,16 @@
        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x8",
+        "EventName": "IDQ.DSB_CYCLES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
@ -54,6 +55,47 @@
        "BriefDescription": "Uops initiated by Decode Stream Buffer (DSB) that are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x10",
+        "EventName": "IDQ.MS_DSB_CYCLES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x10",
+        "EdgeDetect": "1",
+        "EventName": "IDQ.MS_DSB_OCCUR",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x18",
+        "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops.",
+        "CounterMask": "4",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x18",
+        "EventName": "IDQ.ALL_DSB_CYCLES_ANY_UOPS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
@ -63,6 +105,26 @@
        "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x24",
+        "EventName": "IDQ.ALL_MITE_CYCLES_4_UOPS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles MITE is delivering 4 Uops.",
+        "CounterMask": "4",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x24",
+        "EventName": "IDQ.ALL_MITE_CYCLES_ANY_UOPS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles MITE is delivering any Uop.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
@ -73,7 +135,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
    {
-        "PublicDescription": "This event counts cycles during which the microcode sequencer assisted the front-end in delivering uops.  Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder.  Using other instructions, if possible, will usually improve performance.  See the Intel? 64 and IA-32 Architectures Optimization Reference Manual for more information.",
+        "PublicDescription": "This event counts cycles during which the microcode sequencer assisted the front-end in delivering uops.  Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder.  Using other instructions, if possible, will usually improve performance.  See the Intel\u00ae 64 and IA-32 Architectures Optimization Reference Manual for more information.",
        "EventCode": "0x79",
        "Counter": "0,1,2,3",
        "UMask": "0x30",
@ -83,6 +145,45 @@
        "CounterMask": "1",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x30",
+        "EdgeDetect": "1",
+        "EventName": "IDQ.MS_SWITCHES",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "Counter": "0,1,2,3",
+        "UMask": "0x3c",
+        "EventName": "IDQ.MITE_ALL_UOPS",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x80",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "ICACHE.HIT",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Number of Instruction Cache, Streaming Buffer and Victim Cache Reads. both cacheable and noncacheable, including UC fetches.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "PublicDescription": "This event counts the number of instruction cache, streaming buffer and victim cache misses. Counting includes unchacheable accesses.",
+        "EventCode": "0x80",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "EventName": "ICACHE.MISSES",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Instruction cache, streaming buffer and victim cache misses.",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
    {
        "PublicDescription": "This event counts the number of uops not delivered to the back-end per cycle, per thread, when the back-end was not stalled.  In the ideal case 4 uops can be delivered each cycle.  The event counts the undelivered uops - so if 3 were delivered in one cycle, the counter would be incremented by 1 for that cycle (4 - 3). If the back-end is stalled, the count for this event is not incremented even when uops were not delivered, because the back-end would not have been able to accept them.  This event is used in determining the front-end bound category of the top-down pipeline slots characterization.",
        "EventCode": "0x9C",
@ -113,6 +214,48 @@
        "CounterMask": "3",
        "CounterHTOff": "0,1,2,3"
    },
+    {
+        "EventCode": "0x9C",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles with less than 2 uops delivered by the front end.",
+        "CounterMask": "2",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0x9C",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles with less than 3 uops delivered by the front end.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0x9C",
+        "Invert": "1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_GE_1_UOP_DELIV.CORE",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Cycles when 1 or more uops were delivered to the by the front end.",
+        "CounterMask": "4",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0x9C",
+        "Invert": "1",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK",
+        "SampleAfterValue": "2000003",
+        "BriefDescription": "Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.",
+        "CounterMask": "1",
+        "CounterHTOff": "0,1,2,3"
+    },
    {
        "EventCode": "0xAB",
        "Counter": "0,1,2,3",
@ -150,118 +293,6 @@
        "BriefDescription": "Cycles when Decode Stream Buffer (DSB) fill encounter more than 3 Decode Stream Buffer (DSB) lines.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x4",
-        "EventName": "IDQ.MITE_CYCLES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x8",
-        "EventName": "IDQ.DSB_CYCLES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x10",
-        "EventName": "IDQ.MS_DSB_CYCLES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequenser (MS) is busy.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x10",
-        "EdgeDetect": "1",
-        "EventName": "IDQ.MS_DSB_OCCUR",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Deliveries to Instruction Decode Queue (IDQ) initiated by Decode Stream Buffer (DSB) while Microcode Sequenser (MS) is busy.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x9C",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with less than 2 uops delivered by the front end.",
-        "CounterMask": "2",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "EventCode": "0x9C",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles with less than 3 uops delivered by the front end.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "EventCode": "0x9C",
-        "Invert": "1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_GE_1_UOP_DELIV.CORE",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles when 1 or more uops were delivered to the by the front end.",
-        "CounterMask": "4",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x18",
-        "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops.",
-        "CounterMask": "4",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x18",
-        "EventName": "IDQ.ALL_DSB_CYCLES_ANY_UOPS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x24",
-        "EventName": "IDQ.ALL_MITE_CYCLES_4_UOPS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles MITE is delivering 4 Uops.",
-        "CounterMask": "4",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x24",
-        "EventName": "IDQ.ALL_MITE_CYCLES_ANY_UOPS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Cycles MITE is delivering any Uop.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
    {
        "EventCode": "0xAC",
        "Counter": "0,1,2,3",
@ -270,36 +301,5 @@
        "SampleAfterValue": "2000003",
        "BriefDescription": "Cases of cancelling valid Decode Stream Buffer (DSB) fill not because of exceeding way limit.",
        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x9C",
-        "Invert": "1",
-        "Counter": "0,1,2,3",
-        "UMask": "0x1",
-        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x3c",
-        "EventName": "IDQ.MITE_ALL_UOPS",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path.",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
-    },
-    {
-        "EventCode": "0x79",
-        "Counter": "0,1,2,3",
-        "UMask": "0x30",
-        "EdgeDetect": "1",
-        "EventName": "IDQ.MS_SWITCHES",
-        "SampleAfterValue": "2000003",
-        "BriefDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.",
-        "CounterMask": "1",
-        "CounterHTOff": "0,1,2,3,4,5,6,7"
    }
 ]
--- a/Show More
+++ b/Show More