From 3247343118daa73f2b94b7fa565425d1d9f9ac84 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Nov 2013 18:52:21 +0100 Subject: [PATCH 001/282] uprobes: Add uprobe_task->dup_xol_work/dup_xol_addr uprobe_task->vaddr is a bit strange. The generic code uses it only to pass the additional argument to arch_uprobe_pre_xol(), and since it is always equal to instruction_pointer() this looks even more strange. And both utask->vaddr and and utask->autask have the same scope, they only have the meaning when the task executes the probed insn out-of-line, so it is safe to reuse both in UTASK_RUNNING state. This all means that logically ->vaddr belongs to arch_uprobe_task and we should probably move it there, arch_uprobe_pre_xol() can record instruction_pointer() itself. OTOH, it is also used by uprobe_copy_process() and dup_xol_work() for another purpose, this doesn't look clean and doesn't allow to move this member into arch_uprobe_task. This patch adds the union with 2 anonymous structs into uprobe_task. The first struct is autask + vaddr, this way we "almost" move vaddr into autask. The second struct has 2 new members for uprobe_copy_process() paths: ->dup_xol_addr which can be used instead ->vaddr, and ->dup_xol_work which can be used to avoid kmalloc() and simplify the code. Note that this union will likely have another member(s), we need something like "private_data_for_handlers" so that the tracing handlers could use it to communicate with call_fetch() methods. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 21 ++++++++++++++++----- kernel/events/uprobes.c | 16 ++++------------ 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 319eae70fe84..2225542624de 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -26,6 +26,7 @@ #include #include +#include struct vm_area_struct; struct mm_struct; @@ -72,14 +73,24 @@ enum uprobe_task_state { */ struct uprobe_task { enum uprobe_task_state state; - struct arch_uprobe_task autask; + + union { + struct { + struct arch_uprobe_task autask; + unsigned long vaddr; + }; + + struct { + struct callback_head dup_xol_work; + unsigned long dup_xol_addr; + }; + }; + + struct uprobe *active_uprobe; + unsigned long xol_vaddr; struct return_instance *return_instances; unsigned int depth; - struct uprobe *active_uprobe; - - unsigned long xol_vaddr; - unsigned long vaddr; }; /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 24b7d6ca871b..df4ef0971266 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1403,12 +1403,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg) static void dup_xol_work(struct callback_head *work) { - kfree(work); - if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->vaddr)) + if (!__create_xol_area(current->utask->dup_xol_addr)) uprobe_warn(current, "dup xol area"); } @@ -1419,7 +1417,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; - struct callback_head *work; struct xol_area *area; t->utask = NULL; @@ -1441,14 +1438,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) if (mm == t->mm) return; - /* TODO: move it into the union in uprobe_task */ - work = kmalloc(sizeof(*work), GFP_KERNEL); - if (!work) - return uprobe_warn(t, "dup xol area"); - - t->utask->vaddr = area->vaddr; 
- init_task_work(work, dup_xol_work); - task_work_add(t, work, true); + t->utask->dup_xol_addr = area->vaddr; + init_task_work(&t->utask->dup_xol_work, dup_xol_work); + task_work_add(t, &t->utask->dup_xol_work, true); } /* From 803200e24abf0f9ec18631290d26b2185477f3a6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Nov 2013 17:58:54 +0100 Subject: [PATCH 002/282] uprobes: Don't assume that arch_uprobe->insn/ixol is u8[MAX_UINSN_BYTES] arch_uprobe should be opaque as much as possible to the generic code, but currently it assumes that insn/ixol must be u8[] of the known size. Remove this unnecessary dependency, we can use "&" and and sizeof() with the same effect. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index df4ef0971266..445962a72498 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -330,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -529,8 +529,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; - void *insn = uprobe->arch.insn; - int size = MAX_UINSN_BYTES; + void *insn = &uprobe->arch.insn; + int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ @@ -569,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -1264,7 +1264,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) /* Initialize the slot */ copy_to_page(area->page, xol_vaddr, - uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. From 3d78e945b6249d4ef2308192343f8b203b1d7ea5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Nov 2013 18:44:19 +0100 Subject: [PATCH 003/282] uprobes/powerpc: Kill arch_uprobe->ainsn powerpc has both arch_uprobe->insn and arch_uprobe->ainsn to make the generic code happy. This is no longer needed after the previous change, powerpc can just use "u32 insn". 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli --- arch/powerpc/include/asm/uprobes.h | 5 ++--- arch/powerpc/kernel/uprobes.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/uprobes.h b/arch/powerpc/include/asm/uprobes.h index 75c6ecdb8f37..7422a999a39a 100644 --- a/arch/powerpc/include/asm/uprobes.h +++ b/arch/powerpc/include/asm/uprobes.h @@ -36,9 +36,8 @@ typedef ppc_opcode_t uprobe_opcode_t; struct arch_uprobe { union { - u8 insn[MAX_UINSN_BYTES]; - u8 ixol[MAX_UINSN_BYTES]; - u32 ainsn; + u32 insn; + u32 ixol; }; }; diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index 59f419b935f2..003b20964ea0 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -186,7 +186,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) * emulate_step() returns 1 if the insn was successfully emulated. * For all other cases, we need to single-step in hardware. */ - ret = emulate_step(regs, auprobe->ainsn); + ret = emulate_step(regs, auprobe->insn); if (ret > 0) return true; From c912dae60ae6f659455f239298110adc67a5f3e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 9 Nov 2013 19:49:39 +0100 Subject: [PATCH 004/282] uprobes: Cleanup !CONFIG_UPROBES decls, unexport xol_area 1. Don't include asm/uprobes.h unconditionally, we only need it if CONFIG_UPROBES. 2. Move the definition of "struct xol_area" into uprobes.c. Perhaps we should simply kill struct uprobes_state, it buys nothing. 3. Kill the dummy definition of uprobe_get_swbp_addr(), nobody except handle_swbp() needs it. 4. Purely cosmetic, but move the decl of uprobe_get_swbp_addr() up, close to other __weak helpers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 31 ++++--------------------------- kernel/events/uprobes.c | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2225542624de..e32251e00e62 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -33,10 +33,6 @@ struct mm_struct; struct inode; struct notifier_block; -#ifdef CONFIG_ARCH_SUPPORTS_UPROBES -# include -#endif - #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 @@ -61,6 +57,8 @@ struct uprobe_consumer { }; #ifdef CONFIG_UPROBES +#include + enum uprobe_task_state { UTASK_RUNNING, UTASK_SSTEP, @@ -93,24 +91,7 @@ struct uprobe_task { unsigned int depth; }; -/* - * On a breakpoint hit, thread contests for a slot. It frees the - * slot after singlestep. Currently a fixed number of slots are - * allocated. - */ -struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *page; - - /* - * We keep the vma's vm_start rather than a pointer to the vma - * itself. The probed process or a naughty kernel module could make - * the vma go away, and we must handle that reasonably gracefully. 
- */ - unsigned long vaddr; /* Page(s) of instruction slots */ -}; +struct xol_area; struct uprobes_state { struct xol_area *xol_area; @@ -120,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); +extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -131,7 +113,6 @@ extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); -extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -187,10 +168,6 @@ static inline bool uprobe_deny_signal(void) { return false; } -static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) -{ - return 0; -} static inline void uprobe_free_utask(struct task_struct *t) { } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 445962a72498..51a7f535ff96 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -85,6 +85,25 @@ struct return_instance { struct return_instance *next; /* keep as stack */ }; +/* + * On a breakpoint hit, thread contests for a slot. It frees the + * slot after singlestep. Currently a fixed number of slots are + * allocated. + */ +struct xol_area { + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct page *page; + + /* + * We keep the vma's vm_start rather than a pointer to the vma + * itself. The probed process or a naughty kernel module could make + * the vma go away, and we must handle that reasonably gracefully. + */ + unsigned long vaddr; /* Page(s) of instruction slots */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have From ad439356ae5ae7688b39f1107fd5b874850fec18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Nov 2013 17:20:21 +0100 Subject: [PATCH 005/282] uprobes: Document xol_area and arch_uprobe->insn/ixol Document xol_area and arch_uprobe. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 51a7f535ff96..b886a5e7d4ff 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,17 @@ struct uprobe { struct inode *inode; /* Also hold a ref to inode */ loff_t offset; unsigned long flags; + + /* + * The generic code assumes that it has two members of unknown type + * owned by the arch-specific code: + * + * insn - copy_insn() saves the original instruction here for + * arch_uprobe_analyze_insn(). 
+ * + * ixol - potentially modified instruction to execute out of + * line, copied to xol_area by xol_get_insn_slot(). + */ struct arch_uprobe arch; }; @@ -86,6 +97,10 @@ struct return_instance { }; /* + * Execute out of line area: anonymous executable mapping installed + * by the probed task to execute the copy of the original instruction + * mangled by set_swbp(). + * * On a breakpoint hit, thread contests for a slot. It frees the * slot after singlestep. Currently a fixed number of slots are * allocated. From 71ad88efebbcde374bddf904b96f3a7fc82d45d4 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:48 +0100 Subject: [PATCH 006/282] perf: Add active_entry list head to struct perf_event This patch adds a new field to the struct perf_event. It is intended to be used to chain events which are active (enabled). It helps in the hardware layer for PMUs which do not have actual counter restrictions, i.e., free running read-only counters. Active events are chained as opposed to being tracked via the counter they use. To save space we use a union with hlist_entry as both are mutually exclusive (suggested by Jiri Olsa). Signed-off-by: Stephane Eranian Reviewed-by: Andi Kleen Signed-off-by: Peter Zijlstra Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1384275531-10892-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 ++++- kernel/events/core.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2e069d1288df..8f4a70f2eca8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -319,7 +319,10 @@ struct perf_event { */ struct list_head migrate_entry; - struct hlist_node hlist_entry; + union { + struct hlist_node hlist_entry; + struct list_head active_entry; + }; int nr_siblings; int group_flags; struct perf_event *group_leader; diff --git a/kernel/events/core.c b/kernel/events/core.c index 72348dc192c1..403b781daafb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6655,6 +6655,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); + INIT_LIST_HEAD(&event->active_entry); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); From 410136f5dd96b6013fe6d1011b523b1c247e1ccb Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:49 +0100 Subject: [PATCH 007/282] tools/perf/stat: Add event unit and scale support This patch adds perf stat support for handling event units and scales as exported by the kernel. The kernel can export PMU events actual unit and scaling factor via sysfs: $ ls -1 /sys/devices/power/events/energy-* /sys/devices/power/events/energy-cores /sys/devices/power/events/energy-cores.scale /sys/devices/power/events/energy-cores.unit /sys/devices/power/events/energy-pkg /sys/devices/power/events/energy-pkg.scale /sys/devices/power/events/energy-pkg.unit $ cat /sys/devices/power/events/energy-cores.scale 2.3283064365386962890625e-10 $ cat cat /sys/devices/power/events/energy-cores.unit Joules This patch modifies the pmu event alias code to check for the presence of the .unit and .scale files to load the corresponding values. 
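(Worked illustration, not part of the patch: applying the scale on the tool side is a plain multiplication. Using the energy-pkg numbers from the sample run below, a raw counter delta of roughly 1.32e10, multiplied by the 2.3283064365386962890625e-10 (i.e. 2^-32) Joules scale, yields the 3.07 Joules printed for the first interval.)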
They are then used by perf stat transparently: # perf stat -a -e power/energy-pkg/,power/energy-cores/,cycles -I 1000 sleep 1000 # time counts unit events 1.000214717 3.07 Joules power/energy-pkg/ [100.00%] 1.000214717 0.53 Joules power/energy-cores/ 1.000214717 12965028 cycles [100.00%] 2.000749289 3.01 Joules power/energy-pkg/ 2.000749289 0.52 Joules power/energy-cores/ 2.000749289 15817043 cycles When the event does not have an explicit unit exported by the kernel, nothing is printed. In csv output mode, there will be an empty field. Special thanks to Jiri for providing the supporting code in the parser to trigger reading of the scale and unit files. Signed-off-by: Stephane Eranian Reviewed-by: Jiri Olsa Reviewed-by: Andi Kleen Signed-off-by: Peter Zijlstra Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: maria.n.dimakopoulou@gmail.com Cc: acme@redhat.com Link: http://lkml.kernel.org/r/1384275531-10892-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 114 ++++++++++++++++++++------- tools/perf/util/evsel.c | 2 + tools/perf/util/evsel.h | 3 + tools/perf/util/parse-events.c | 28 ++++--- tools/perf/util/pmu.c | 138 ++++++++++++++++++++++++++++++++- tools/perf/util/pmu.h | 3 +- 6 files changed, 244 insertions(+), 44 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ee0d565f83e3..dab98b50c9fe 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -138,6 +138,7 @@ static const char *post_cmd = NULL; static bool sync_run = false; static unsigned int interval = 0; static unsigned int initial_delay = 0; +static unsigned int unit_width = 4; /* strlen("unit") */ static bool forever = false; static struct timespec ref_time; static struct cpu_map *aggr_map; @@ -461,17 +462,17 @@ static void print_interval(void) if (num_print_interval == 0 && !csv_output) { switch (aggr_mode) { case AGGR_SOCKET: - fprintf(output, "# time socket cpus counts events\n"); + fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); break; case AGGR_CORE: - fprintf(output, "# time core cpus counts events\n"); + fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); break; case AGGR_NONE: - fprintf(output, "# time CPU counts events\n"); + fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); break; case AGGR_GLOBAL: default: - fprintf(output, "# time counts events\n"); + fprintf(output, "# time counts %*s events\n", unit_width, "unit"); } } @@ -516,6 +517,7 @@ static int __run_perf_stat(int argc, const char **argv) unsigned long long t0, t1; struct perf_evsel *counter; struct timespec ts; + size_t l; int status = 0; const bool forks = (argc > 0); @@ -565,6 +567,10 @@ static int __run_perf_stat(int argc, const char **argv) return -1; } counter->supported = true; + + l = strlen(counter->unit); + if (l > unit_width) + unit_width = l; } if (perf_evlist__apply_filters(evsel_list)) { @@ -704,14 +710,25 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg) { double msecs = avg / 1e6; - const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s"; + const char *fmt_v, *fmt_n; char name[25]; + fmt_v = csv_output ? "%.6f%s" : "%18.6f%s"; + fmt_n = csv_output ? "%s" : "%-25s"; + aggr_printout(evsel, cpu, nr); scnprintf(name, sizeof(name), "%s%s", perf_evsel__name(evsel), csv_output ? 
"" : " (msec)"); - fprintf(output, fmt, msecs, csv_sep, name); + + fprintf(output, fmt_v, msecs, csv_sep); + + if (csv_output) + fprintf(output, "%s%s", evsel->unit, csv_sep); + else + fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep); + + fprintf(output, fmt_n, name); if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); @@ -908,21 +925,31 @@ static void print_ll_cache_misses(int cpu, static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0, total2; + double sc = evsel->scale; const char *fmt; - if (csv_output) - fmt = "%.0f%s%s"; - else if (big_num) - fmt = "%'18.0f%s%-25s"; - else - fmt = "%18.0f%s%-25s"; + if (csv_output) { + fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s"; + } else { + if (big_num) + fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s"; + else + fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s"; + } aggr_printout(evsel, cpu, nr); if (aggr_mode == AGGR_GLOBAL) cpu = 0; - fprintf(output, fmt, avg, csv_sep, perf_evsel__name(evsel)); + fprintf(output, fmt, avg, csv_sep); + + if (evsel->unit) + fprintf(output, "%-*s%s", + csv_output ? 0 : unit_width, + evsel->unit, csv_sep); + + fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel)); if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); @@ -941,7 +968,10 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg) if (total && avg) { ratio = total / avg; - fprintf(output, "\n # %5.2f stalled cycles per insn", ratio); + fprintf(output, "\n"); + if (aggr_mode == AGGR_NONE) + fprintf(output, " "); + fprintf(output, " # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && @@ -1061,6 +1091,7 @@ static void print_aggr(char *prefix) { struct perf_evsel *counter; int cpu, cpu2, s, s2, id, nr; + double uval; u64 ena, run, val; if (!(aggr_map || aggr_get_id)) @@ -1087,11 +1118,17 @@ static void print_aggr(char *prefix) if (run == 0 || ena == 0) { aggr_printout(counter, id, nr); - fprintf(output, "%*s%s%*s", + fprintf(output, "%*s%s", csv_output ? 0 : 18, counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep, - csv_output ? 0 : -24, + csv_sep); + + fprintf(output, "%-*s%s", + csv_output ? 0 : unit_width, + counter->unit, csv_sep); + + fprintf(output, "%*s", + csv_output ? 0 : -25, perf_evsel__name(counter)); if (counter->cgrp) @@ -1101,11 +1138,12 @@ static void print_aggr(char *prefix) fputc('\n', output); continue; } + uval = val * counter->scale; if (nsec_counter(counter)) - nsec_printout(id, nr, counter, val); + nsec_printout(id, nr, counter, uval); else - abs_printout(id, nr, counter, val); + abs_printout(id, nr, counter, uval); if (!csv_output) { print_noise(counter, 1.0); @@ -1128,16 +1166,21 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) struct perf_stat *ps = counter->priv; double avg = avg_stats(&ps->res_stats[0]); int scaled = counter->counts->scaled; + double uval; if (prefix) fprintf(output, "%s", prefix); if (scaled == -1) { - fprintf(output, "%*s%s%*s", + fprintf(output, "%*s%s", csv_output ? 0 : 18, counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep, - csv_output ? 0 : -24, + csv_sep); + fprintf(output, "%-*s%s", + csv_output ? 0 : unit_width, + counter->unit, csv_sep); + fprintf(output, "%*s", + csv_output ? 
0 : -25, perf_evsel__name(counter)); if (counter->cgrp) @@ -1147,10 +1190,12 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) return; } + uval = avg * counter->scale; + if (nsec_counter(counter)) - nsec_printout(-1, 0, counter, avg); + nsec_printout(-1, 0, counter, uval); else - abs_printout(-1, 0, counter, avg); + abs_printout(-1, 0, counter, uval); print_noise(counter, avg); @@ -1177,6 +1222,7 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) static void print_counter(struct perf_evsel *counter, char *prefix) { u64 ena, run, val; + double uval; int cpu; for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { @@ -1188,14 +1234,20 @@ static void print_counter(struct perf_evsel *counter, char *prefix) fprintf(output, "%s", prefix); if (run == 0 || ena == 0) { - fprintf(output, "CPU%*d%s%*s%s%*s", + fprintf(output, "CPU%*d%s%*s%s", csv_output ? 0 : -4, perf_evsel__cpus(counter)->map[cpu], csv_sep, csv_output ? 0 : 18, counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep, - csv_output ? 0 : -24, - perf_evsel__name(counter)); + csv_sep); + + fprintf(output, "%-*s%s", + csv_output ? 0 : unit_width, + counter->unit, csv_sep); + + fprintf(output, "%*s", + csv_output ? 0 : -25, + perf_evsel__name(counter)); if (counter->cgrp) fprintf(output, "%s%s", @@ -1205,10 +1257,12 @@ static void print_counter(struct perf_evsel *counter, char *prefix) continue; } + uval = val * counter->scale; + if (nsec_counter(counter)) - nsec_printout(cpu, 0, counter, val); + nsec_printout(cpu, 0, counter, uval); else - abs_printout(cpu, 0, counter, val); + abs_printout(cpu, 0, counter, uval); if (!csv_output) { print_noise(counter, 1.0); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 46dd4c2a41ce..dad64926170f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -162,6 +162,8 @@ void perf_evsel__init(struct perf_evsel *evsel, evsel->idx = idx; evsel->attr = *attr; evsel->leader = evsel; + evsel->unit = ""; + evsel->scale = 1.0; INIT_LIST_HEAD(&evsel->node); hists__init(&evsel->hists); evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1ea7c92e6e33..8120eeb86ac1 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -68,6 +68,8 @@ struct perf_evsel { u32 ids; struct hists hists; char *name; + double scale; + const char *unit; struct event_format *tp_format; union { void *priv; @@ -138,6 +140,7 @@ extern const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX]; int __perf_evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); const char *perf_evsel__name(struct perf_evsel *evsel); + const char *perf_evsel__group_name(struct perf_evsel *evsel); int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6de6f89c2a61..969cb8f0d88d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -269,9 +269,10 @@ const char *event_type(int type) -static int __add_event(struct list_head *list, int *idx, - struct perf_event_attr *attr, - char *name, struct cpu_map *cpus) +static struct perf_evsel * +__add_event(struct list_head *list, int *idx, + struct perf_event_attr *attr, + char *name, struct cpu_map *cpus) { struct perf_evsel *evsel; @@ -279,19 +280,19 @@ static int __add_event(struct list_head *list, int *idx, evsel = perf_evsel__new_idx(attr, (*idx)++); if (!evsel) - 
return -ENOMEM; + return NULL; evsel->cpus = cpus; if (name) evsel->name = strdup(name); list_add_tail(&evsel->node, list); - return 0; + return evsel; } static int add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name) { - return __add_event(list, idx, attr, name, NULL); + return __add_event(list, idx, attr, name, NULL) ? 0 : -ENOMEM; } static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) @@ -633,6 +634,9 @@ int parse_events_add_pmu(struct list_head *list, int *idx, { struct perf_event_attr attr; struct perf_pmu *pmu; + struct perf_evsel *evsel; + char *unit; + double scale; pmu = perf_pmu__find(name); if (!pmu) @@ -640,7 +644,7 @@ int parse_events_add_pmu(struct list_head *list, int *idx, memset(&attr, 0, sizeof(attr)); - if (perf_pmu__check_alias(pmu, head_config)) + if (perf_pmu__check_alias(pmu, head_config, &unit, &scale)) return -EINVAL; /* @@ -652,8 +656,14 @@ int parse_events_add_pmu(struct list_head *list, int *idx, if (perf_pmu__config(pmu, &attr, head_config)) return -EINVAL; - return __add_event(list, idx, &attr, pmu_event_name(head_config), - pmu->cpus); + evsel = __add_event(list, idx, &attr, pmu_event_name(head_config), + pmu->cpus); + if (evsel) { + evsel->unit = unit; + evsel->scale = scale; + } + + return evsel ? 0 : -ENOMEM; } int parse_events__modifier_group(struct list_head *list, diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index c232d8dd410b..56fc10a5e288 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1,19 +1,23 @@ #include #include -#include #include #include #include #include "fs.h" +#include #include "util.h" #include "pmu.h" #include "parse-events.h" #include "cpumap.h" +#define UNIT_MAX_LEN 31 /* max length for event unit name */ + struct perf_pmu_alias { char *name; struct list_head terms; struct list_head list; + char unit[UNIT_MAX_LEN+1]; + double scale; }; struct perf_pmu_format { @@ -94,7 +98,80 @@ static int pmu_format(const char *name, struct list_head *format) return 0; } -static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) +static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *name) +{ + struct stat st; + ssize_t sret; + char scale[128]; + int fd, ret = -1; + char path[PATH_MAX]; + char *lc; + + snprintf(path, PATH_MAX, "%s/%s.scale", dir, name); + + fd = open(path, O_RDONLY); + if (fd == -1) + return -1; + + if (fstat(fd, &st) < 0) + goto error; + + sret = read(fd, scale, sizeof(scale)-1); + if (sret < 0) + goto error; + + scale[sret] = '\0'; + /* + * save current locale + */ + lc = setlocale(LC_NUMERIC, NULL); + + /* + * force to C locale to ensure kernel + * scale string is converted correctly. + * kernel uses default C locale. 
+ */ + setlocale(LC_NUMERIC, "C"); + + alias->scale = strtod(scale, NULL); + + /* restore locale */ + setlocale(LC_NUMERIC, lc); + + ret = 0; +error: + close(fd); + return ret; +} + +static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *name) +{ + char path[PATH_MAX]; + ssize_t sret; + int fd; + + snprintf(path, PATH_MAX, "%s/%s.unit", dir, name); + + fd = open(path, O_RDONLY); + if (fd == -1) + return -1; + + sret = read(fd, alias->unit, UNIT_MAX_LEN); + if (sret < 0) + goto error; + + close(fd); + + alias->unit[sret] = '\0'; + + return 0; +error: + close(fd); + alias->unit[0] = '\0'; + return -1; +} + +static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FILE *file) { struct perf_pmu_alias *alias; char buf[256]; @@ -110,6 +187,9 @@ static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) return -ENOMEM; INIT_LIST_HEAD(&alias->terms); + alias->scale = 1.0; + alias->unit[0] = '\0'; + ret = parse_events_terms(&alias->terms, buf); if (ret) { free(alias); @@ -117,7 +197,14 @@ static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) } alias->name = strdup(name); + /* + * load unit name and scale if available + */ + perf_pmu__parse_unit(alias, dir, name); + perf_pmu__parse_scale(alias, dir, name); + list_add_tail(&alias->list, list); + return 0; } @@ -129,6 +216,7 @@ static int pmu_aliases_parse(char *dir, struct list_head *head) { struct dirent *evt_ent; DIR *event_dir; + size_t len; int ret = 0; event_dir = opendir(dir); @@ -143,13 +231,24 @@ static int pmu_aliases_parse(char *dir, struct list_head *head) if (!strcmp(name, ".") || !strcmp(name, "..")) continue; + /* + * skip .unit and .scale info files + * parsed in perf_pmu__new_alias() + */ + len = strlen(name); + if (len > 5 && !strcmp(name + len - 5, ".unit")) + continue; + if (len > 6 && !strcmp(name + len - 6, ".scale")) + continue; + snprintf(path, PATH_MAX, "%s/%s", dir, name); ret = -EINVAL; file = fopen(path, "r"); if (!file) break; - ret = perf_pmu__new_alias(head, name, file); + + ret = perf_pmu__new_alias(head, dir, name, file); fclose(file); } @@ -508,16 +607,42 @@ static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu, return NULL; } + +static int check_unit_scale(struct perf_pmu_alias *alias, + char **unit, double *scale) +{ + /* + * Only one term in event definition can + * define unit and scale, fail if there's + * more than one. 
+ */ + if ((*unit && alias->unit) || + (*scale && alias->scale)) + return -EINVAL; + + if (alias->unit) + *unit = alias->unit; + + if (alias->scale) + *scale = alias->scale; + + return 0; +} + /* * Find alias in the terms list and replace it with the terms * defined for the alias */ -int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) +int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, + char **unit, double *scale) { struct parse_events_term *term, *h; struct perf_pmu_alias *alias; int ret; + *unit = NULL; + *scale = 0; + list_for_each_entry_safe(term, h, head_terms, list) { alias = pmu_find_alias(pmu, term); if (!alias) @@ -525,6 +650,11 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) ret = pmu_alias_terms(alias, &term->list); if (ret) return ret; + + ret = check_unit_scale(alias, unit, scale); + if (ret) + return ret; + list_del(&term->list); free(term); } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 1179b26f244a..9183380e2038 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -28,7 +28,8 @@ int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, int perf_pmu__config_terms(struct list_head *formats, struct perf_event_attr *attr, struct list_head *head_terms); -int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms); +int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, + char **unit, double *scale); struct list_head *perf_pmu__alias(struct perf_pmu *pmu, struct list_head *head_terms); int perf_pmu_wrap(void); From 4788e5b4b2338f85fa42a712a182d8afd65d7c58 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:50 +0100 Subject: [PATCH 008/282] perf/x86: Add Intel RAPL PMU support This patch adds a new uncore PMU to expose the Intel RAPL energy consumption counters. Up to 3 counters, each counting a particular RAPL event are exposed. The RAPL counters are available on Intel SandyBridge, IvyBridge, Haswell. The server skus add a 3rd counter. The following events are available and exposed in sysfs: - power/energy-cores: power consumption of all cores on socket - power/energy-pkg: power consumption of all cores + LLc cache - power/energy-dram: power consumption of DRAM (servers only) For each event both the unit (Joules) and scale (2^-32 J) is exposed in sysfs for use by perf stat and other tools. The files are: /sys/devices/power/events/energy-*.unit /sys/devices/power/events/energy-*.scale The RAPL PMU is uncore by nature and is implemented such that it only works in system-wide mode. Measuring only one CPU per socket is sufficient. The /sys/devices/power/cpumask file can be used by tools to figure out which CPUs to monitor by default. For instance, on a 2-socket system, 2 CPUs (one on each socket) will be shown. All the counters measure in the same unit (exposed via sysfs). The perf_events API exposes all RAPL counters as 64-bit integers counting in unit of 1/2^32 Joules (about 0.23 nJ). User level tools must convert the counts by multiplying them by 2^-32 to obtain Joules. The reason for this is that the kernel avoids doing floating point math whenever possible because it is expensive (user floating-point state must be saved). The method used avoids kernel floating-point usage. There is no loss of precision. Thanks to PeterZ for suggesting this approach. To convert the raw count in Watt: W = C * 2.3 / (1e10 * time) or ldexp(C, -32). 
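A minimal user-space sketch of that conversion (illustrative only, not part of this patch; raw_count and duration_sec are assumed to be supplied by the caller from a perf read and a timestamp delta):

	#include <math.h>
	#include <stdint.h>

	/* raw RAPL perf counts are reported in units of 2^-32 Joules */
	static double rapl_count_to_joules(uint64_t raw_count)
	{
		return ldexp((double)raw_count, -32);
	}

	/* average power over the measurement window, in Watts */
	static double rapl_count_to_watts(uint64_t raw_count, double duration_sec)
	{
		return rapl_count_to_joules(raw_count) / duration_sec;
	}

Tools that consume the sysfs scale file (as perf stat does in the previous patch) end up performing the equivalent multiplication by 2^-32.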
RAPL PMU is a new standalone PMU which registers with the perf_event core subsystem. The PMU type (attr->type) is dynamically allocated and is available from /sys/device/power/type. Sampling is not supported by the RAPL PMU. There is no privilege level filtering either. Signed-off-by: Stephane Eranian Reviewed-by: Maria Dimakopoulou Reviewed-by: Andi Kleen Signed-off-by: Peter Zijlstra Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1384275531-10892-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 591 ++++++++++++++++++++ 2 files changed, 592 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_rapl.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7e99cb..6359506a19ee 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 000000000000..cfcd386b5d89 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,591 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. 
Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include +#include +#include +#include +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. 
+ */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t 
rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + 
/* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); + pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + get_online_cpus(); + + for_each_online_cpu(cpu) { + rapl_cpu_prepare(cpu); + rapl_cpu_init(cpu); + } + + perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + put_online_cpus(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask)); + + put_online_cpus(); + + return 0; +} +device_initcall(rapl_pmu_init); From 65661f96d3b32f4b28fef26d21be81d7e173b965 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 12 Nov 2013 17:58:51 +0100 Subject: [PATCH 009/282] perf/x86: Add RAPL hrtimer support The RAPL PMU counters do not interrupt on overflow. Therefore, the kernel needs to poll the counters to avoid missing an overflow. This patch adds the hrtimer code to do this. The timer interval is calculated at boot time based on the power unit used by the HW. 
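As a worked example of that sizing (using the formula added below, and assuming the 1/(2^16) Joules unit mentioned for SandyBridge in the previous patch): a sustained 200 W reference load increments the counter by about 200 * 2^16 per second, so the 32-bit counter wraps after roughly 2^32 / (200 * 2^16) ≈ 328 seconds; polling at half that period, (1000 / (2 * 100)) * 2^(32 - 16 - 1) ms ≈ 164 seconds, is enough to never miss an overflow.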
There is one hrtimer per-cpu to handle the case of multiple simultaneous use across cores on the same package + hotplug CPU. Thanks to Maria Dimakopoulou for her contributions to this patch especially on the math aspects. Signed-off-by: Stephane Eranian Reviewed-by: Maria Dimakopoulou Reviewed-by: Andi Kleen [ Applied 32-bit build fix. ] Signed-off-by: Peter Zijlstra Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: zheng.z.yan@intel.com Cc: bp@alien8.de Cc: maria.n.dimakopoulou@gmail.com Link: http://lkml.kernel.org/r/1384275531-10892-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 74 ++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index cfcd386b5d89..bf8e4a736d48 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -96,6 +96,8 @@ struct rapl_pmu { int n_active; /* number of active events */ struct list_head active_list; struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; }; static struct pmu rapl_pmu_class; @@ -158,6 +160,48 @@ again: return new_raw_count; } +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + static void __rapl_pmu_event_start(struct rapl_pmu *pmu, struct perf_event *event) { @@ -171,6 +215,8 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, local64_set(&event->hw.prev_count, rapl_read_counter(event)); pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); } static void rapl_pmu_event_start(struct perf_event *event, int mode) @@ -195,6 +241,8 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) if (!(hwc->state & PERF_HES_STOPPED)) { WARN_ON_ONCE(pmu->n_active <= 0); pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); list_del(&event->active_entry); @@ -423,6 +471,9 @@ static void rapl_cpu_exit(int cpu) */ if (target >= 0) perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); } static void rapl_cpu_init(int cpu) @@ -442,6 +493,7 @@ static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); + u64 ms; if (pmu) return 0; @@ -466,6 +518,22 @@ static int rapl_cpu_prepare(int cpu) pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. 
+ * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + /* set RAPL pmu for this cpu for now */ per_cpu(rapl_pmu, cpu) = pmu; per_cpu(rapl_pmu_to_free, cpu) = NULL; @@ -580,9 +648,11 @@ static int __init rapl_pmu_init(void) pr_info("RAPL PMU detected, hw unit 2^-%d Joules," " API unit is 2^-32 Joules," - " %d fixed counters\n", + " %d fixed counters" + " %llu ms ovfl timer\n", pmu->hw_unit, - hweight32(rapl_cntr_mask)); + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); put_online_cpus(); From 12e55569a244996a23cb401e8116e5a060b664f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Nov 2013 18:29:37 -0500 Subject: [PATCH 010/282] tools lib traceevent: Use helper trace-seq in print functions like kernel does Jiri Olsa reported that his plugin for scsi was chopping off part of the output. Investigating this, I found that Jiri used the same functions as what is in the kernel, which adds the following: trace_seq_putc(p, 0); This adds a '\0' to the output string. The reason this works in the kernel is that the "p" that is passed to the function helper is a temporary trace_seq. But in the libtraceevent library, it's the pointer to the trace_seq used to output. By adding the '\0', it truncates the line and nothing added after that will be printed. We can solve this in two ways. One is to have the helper functions for the library not add the unnecessary '\0'. The other is to change the library to also use a helper trace_seq structure that gets copied to the main trace_seq just like the kernel does. The latter allows the helper functions in the plugins to be the same as the kernel, which is the better solution. Signed-off-by: Steven Rostedt Reported-by: Jiri Olsa Tested-by: Jiri Olsa Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20131119182937.401668e3@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 217c82ee3665..900fca01bdd3 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4099,6 +4099,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event unsigned long long val; struct func_map *func; const char *saveptr; + struct trace_seq p; char *bprint_fmt = NULL; char format[32]; int show_func; @@ -4306,8 +4307,12 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event format[len] = 0; if (!len_as_arg) len_arg = -1; - print_str_arg(s, data, size, event, + /* Use helper trace_seq */ + trace_seq_init(&p); + print_str_arg(&p, data, size, event, format, len_arg, arg); + trace_seq_terminate(&p); + trace_seq_puts(s, p.buffer); arg = arg->next; break; default: From 15e65c693d7de4c83d1e97fd89f87d47f2219782 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 14 Nov 2013 18:43:30 +0200 Subject: [PATCH 011/282] perf trace: Remove thread summary coloring Thread summary line coloring looks ugly. It doesn't add much value so remove coloring completely. 
Signed-off-by: Pekka Enberg Acked-by: David Ahern Link: http://lkml.kernel.org/r/1384447410-1771-1-git-send-email-penberg@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8be17fc462ba..e9f345e2551a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2158,7 +2158,6 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv) size_t printed = data->printed; struct trace *trace = data->trace; struct thread_trace *ttrace = thread->priv; - const char *color; double ratio; if (ttrace == NULL) @@ -2166,17 +2165,9 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv) ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 50.0) - color = PERF_COLOR_RED; - else if (ratio > 25.0) - color = PERF_COLOR_GREEN; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; - - printed += color_fprintf(fp, color, " %s (%d), ", thread__comm_str(thread), thread->tid); + printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); printed += fprintf(fp, "%lu events, ", ttrace->nr_events); - printed += color_fprintf(fp, color, "%.1f%%", ratio); + printed += fprintf(fp, "%.1f%%", ratio); printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); printed += thread__dump_stats(ttrace, trace, fp); From bf80669e4f689f181f23a54dfe2a0f264147ad67 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 14 Nov 2013 20:51:30 -0700 Subject: [PATCH 012/282] perf top: Make -g refer to callchains In most commands -g is used for callchains. Make perf-top follow suit. Move group to just --group with no short cut making it similar to perf-record. Signed-off-by: David Ahern Acked-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1384487490-6865-1-git-send-email-dsahern@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-top.txt | 5 ++--- tools/perf/builtin-top.c | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 7de01dd79688..cdd8d4946dba 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -50,7 +50,6 @@ Default is to monitor all CPUS. --count-filter=:: Only display functions with more events than this. --g:: --group:: Put the counters into a counter group. @@ -143,12 +142,12 @@ Default is to monitor all CPUS. --asm-raw:: Show raw instruction encoding of assembly instructions. --G:: +-g:: Enables call-graph (stack chain/backtrace) recording. --call-graph:: Setup and enable call-graph (stack chain/backtrace) recording, - implies -G. + implies -g. 
--max-stack:: Set the stack depth limit when parsing the callchain, anything diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 71e6402729a8..531522d3d97b 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1084,7 +1084,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "dump the symbol table used for profiling"), OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), - OPT_BOOLEAN('g', "group", &opts->group, + OPT_BOOLEAN(0, "group", &opts->group, "put the counters into a counter group"), OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit, "child tasks do not inherit counters"), @@ -1105,7 +1105,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) " abort, in_tx, transaction"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_NOOPT('G', NULL, &top.record_opts, + OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts, NULL, "enables call-graph recording", &callchain_opt), OPT_CALLBACK(0, "call-graph", &top.record_opts, From 2cf025e69543f5f4aa68e8549d60680515fef5ad Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 17 Nov 2013 21:43:23 +0530 Subject: [PATCH 013/282] perf completion: Introduce a layer of indirection Define the variables cur, words, cword, and prev outside the main completion function so that we have a chance to override it when we introduce zsh support. Signed-off-by: Ramkumar Ramachandra Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1384704807-15779-2-git-send-email-artagnon@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bash_completion | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/perf/bash_completion b/tools/perf/bash_completion index 62e157db2e2b..3efdc84f112c 100644 --- a/tools/perf/bash_completion +++ b/tools/perf/bash_completion @@ -89,15 +89,12 @@ __ltrim_colon_completions() fi } -type perf &>/dev/null && -_perf() +__perf_main () { - local cur words cword prev cmd - - COMPREPLY=() - _get_comp_words_by_ref -n =: cur words cword prev + local cmd cmd=${words[0]} + COMPREPLY=() # List perf subcommands or long options if [ $cword -eq 1 ]; then @@ -120,6 +117,14 @@ _perf() opts=$($cmd $subcmd --list-opts) COMPREPLY=( $( compgen -W '$opts' -- "$cur" ) ) fi +} + +type perf &>/dev/null && +_perf() +{ + local cur words cword prev + _get_comp_words_by_ref -n =: cur words cword prev + __perf_main } && complete -o bashdefault -o default -o nospace -F _perf perf 2>/dev/null \ From 12f9dd5042483698c74a133d9004ff1d1a6474f9 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 17 Nov 2013 21:43:24 +0530 Subject: [PATCH 014/282] perf completion: Factor out compgen stuff compgen is a bash-builtin; factor out the invocations into a separate function to give us a chance to override it with a zsh equivalent in future patches. 
Signed-off-by: Ramkumar Ramachandra Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1384704807-15779-3-git-send-email-artagnon@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bash_completion | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/perf/bash_completion b/tools/perf/bash_completion index 3efdc84f112c..824312681601 100644 --- a/tools/perf/bash_completion +++ b/tools/perf/bash_completion @@ -89,6 +89,11 @@ __ltrim_colon_completions() fi } +__perfcomp () +{ + COMPREPLY=( $( compgen -W "$1" -- "$2" ) ) +} + __perf_main () { local cmd @@ -99,23 +104,23 @@ __perf_main () # List perf subcommands or long options if [ $cword -eq 1 ]; then if [[ $cur == --* ]]; then - COMPREPLY=( $( compgen -W '--help --version \ + __perfcomp '--help --version \ --exec-path --html-path --paginate --no-pager \ - --perf-dir --work-tree --debugfs-dir' -- "$cur" ) ) + --perf-dir --work-tree --debugfs-dir' -- "$cur" else cmds=$($cmd --list-cmds) - COMPREPLY=( $( compgen -W '$cmds' -- "$cur" ) ) + __perfcomp "$cmds" "$cur" fi # List possible events for -e option elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then evts=$($cmd list --raw-dump) - COMPREPLY=( $( compgen -W '$evts' -- "$cur" ) ) + __perfcomp "$evts" "$cur" __ltrim_colon_completions $cur # List long option names elif [[ $cur == --* ]]; then subcmd=${words[1]} opts=$($cmd $subcmd --list-opts) - COMPREPLY=( $( compgen -W '$opts' -- "$cur" ) ) + __perfcomp "$opts" "$cur" fi } From 37e72c31061521d6f0e4b7fe47cd5748280ed691 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 17 Nov 2013 21:43:25 +0530 Subject: [PATCH 015/282] perf completion: Factor out call to __ltrim_colon_completions In our sole callsite, __ltrim_colon_completions is called after __perfcomp, to modify the COMPREPLY set by the invocation. This is problematic, because in the zsh equivalent (using compset/ compadd), we'll have to generate completions in one-shot. So factor out this entire callsite into a special override'able __perfcomp_colon function; we will override it when introducing zsh support. Signed-off-by: Ramkumar Ramachandra Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1384704807-15779-4-git-send-email-artagnon@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bash_completion | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/bash_completion b/tools/perf/bash_completion index 824312681601..573599b42be2 100644 --- a/tools/perf/bash_completion +++ b/tools/perf/bash_completion @@ -94,6 +94,12 @@ __perfcomp () COMPREPLY=( $( compgen -W "$1" -- "$2" ) ) } +__perfcomp_colon () +{ + __perfcomp "$1" "$2" + __ltrim_colon_completions $cur +} + __perf_main () { local cmd @@ -114,8 +120,7 @@ __perf_main () # List possible events for -e option elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then evts=$($cmd list --raw-dump) - __perfcomp "$evts" "$cur" - __ltrim_colon_completions $cur + __perfcomp_colon "$evts" "$cur" # List long option names elif [[ $cur == --* ]]; then subcmd=${words[1]} From f38ab8af794c184c15f5e001d0eaa16f4a120978 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 17 Nov 2013 21:43:26 +0530 Subject: [PATCH 016/282] perf completion: Introduce zsh support __perfcomp(), __perfcomp_colon(), and _perf() have to be overridden. Inspired by the way the git.git completion system is structured. 
Signed-off-by: Ramkumar Ramachandra Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1384704807-15779-5-git-send-email-artagnon@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bash_completion | 63 +++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/perf/bash_completion b/tools/perf/bash_completion index 573599b42be2..49494882d9bb 100644 --- a/tools/perf/bash_completion +++ b/tools/perf/bash_completion @@ -1,4 +1,4 @@ -# perf completion +# perf bash and zsh completion # Taken from git.git's completion script. __my_reassemble_comp_words_by_ref() @@ -129,6 +129,67 @@ __perf_main () fi } +if [[ -n ${ZSH_VERSION-} ]]; then + autoload -U +X compinit && compinit + + __perfcomp () + { + emulate -L zsh + + local c IFS=$' \t\n' + local -a array + + for c in ${=1}; do + case $c in + --*=*|*.) ;; + *) c="$c " ;; + esac + array[${#array[@]}+1]="$c" + done + + compset -P '*[=:]' + compadd -Q -S '' -a -- array && _ret=0 + } + + __perfcomp_colon () + { + emulate -L zsh + + local cur_="${2-$cur}" + local c IFS=$' \t\n' + local -a array + + if [[ "$cur_" == *:* ]]; then + local colon_word=${cur_%"${cur_##*:}"} + fi + + for c in ${=1}; do + case $c in + --*=*|*.) ;; + *) c="$c " ;; + esac + array[$#array+1]=${c#"$colon_word"} + done + + compset -P '*[=:]' + compadd -Q -S '' -a -- array && _ret=0 + } + + _perf () + { + local _ret=1 cur cword prev + cur=${words[CURRENT]} + prev=${words[CURRENT-1]} + let cword=CURRENT-1 + emulate ksh -c __perf_main + let _ret && _default && _ret=0 + return _ret + } + + compdef _perf perf + return +fi + type perf &>/dev/null && _perf() { From a8b4c7014cadfdacd4e1f4c963128593be6f20de Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 17 Nov 2013 21:43:27 +0530 Subject: [PATCH 017/282] perf completion: Rename file to reflect zsh support Signed-off-by: Ramkumar Ramachandra Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1384704807-15779-6-git-send-email-artagnon@gmail.com [ Fix 'make install' target ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 4 ++-- tools/perf/{bash_completion => perf-completion.sh} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename tools/perf/{bash_completion => perf-completion.sh} (100%) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7fc8f179cae7..e416ccc7d831 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -840,9 +840,9 @@ ifndef NO_LIBPYTHON $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'; \ $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin' endif - $(call QUIET_INSTALL, bash_completion-script) \ + $(call QUIET_INSTALL, perf_completion-script) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \ - $(INSTALL) bash_completion '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf' + $(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf' $(call QUIET_INSTALL, tests) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \ $(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \ diff --git a/tools/perf/bash_completion b/tools/perf/perf-completion.sh similarity index 100% rename from tools/perf/bash_completion rename to tools/perf/perf-completion.sh From e944d3d7d151eea149c62310eaff7b92c7732f58 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Nov 2013 14:34:52 +0900 Subject: [PATCH 018/282] perf 
script: Move evname print code to process_event() The print_sample_start() will be reused by other printing routine for internal events like COMM, FORK and EXIT from next patch. And because they're not tied to a specific event, move the evname print code to its caller. Signed-off-by: Namhyung Kim Reviewed-by: David Ahern Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1384752894-10974-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index baf17989a216..b392770766dd 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -288,7 +288,6 @@ static void print_sample_start(struct perf_sample *sample, struct perf_evsel *evsel) { struct perf_event_attr *attr = &evsel->attr; - const char *evname = NULL; unsigned long secs; unsigned long usecs; unsigned long long nsecs; @@ -323,11 +322,6 @@ static void print_sample_start(struct perf_sample *sample, usecs = nsecs / NSECS_PER_USEC; printf("%5lu.%06lu: ", secs, usecs); } - - if (PRINT_FIELD(EVNAME)) { - evname = perf_evsel__name(evsel); - printf("%s: ", evname ? evname : "[unknown]"); - } } static bool is_bts_event(struct perf_event_attr *attr) @@ -434,6 +428,11 @@ static void process_event(union perf_event *event, struct perf_sample *sample, print_sample_start(sample, thread, evsel); + if (PRINT_FIELD(EVNAME)) { + const char *evname = perf_evsel__name(evsel); + printf("%s: ", evname ? evname : "[unknown]"); + } + if (is_bts_event(attr)) { print_sample_bts(event, sample, evsel, machine, thread); return; From 3aa5939d71fa22a947808ba9c798b8537c35097a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 15 Nov 2013 15:52:29 +0200 Subject: [PATCH 019/282] perf record: Make per-cpu mmaps the default. This affects the -p, -t and -u options that previously defaulted to per-thread mmaps. Consequently add an option to select per-thread mmaps to support the old behaviour. Note that per-thread can be used with a workload-only (i.e. none of -p, -t, -u, -a or -C is selected) to get a per-thread mmap with no inheritance. Signed-off-by: Adrian Hunter Acked-by: Ingo Molnar Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/5286271D.3020808@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 10 +++++----- tools/perf/builtin-record.c | 5 +++-- tools/perf/tests/attr/test-record-no-inherit | 2 +- tools/perf/util/evlist.c | 6 ++++-- tools/perf/util/evsel.c | 5 +++-- tools/perf/util/target.c | 11 ++++++++++- tools/perf/util/target.h | 4 +++- 7 files changed, 29 insertions(+), 14 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 43b42c4f4a91..6ac867e7667f 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -201,11 +201,11 @@ abort events and some memory events in precise mode on modern Intel CPUs. --transaction:: Record transaction flags for transaction related events. ---force-per-cpu:: -Force the use of per-cpu mmaps. By default, when tasks are specified (i.e. -p, --t or -u options) per-thread mmaps are created. 
This option overrides that and -forces per-cpu mmaps. A side-effect of that is that inheritance is -automatically enabled. Add the -i option also to disable inheritance. +--per-thread:: +Use per-thread mmaps. By default per-cpu mmaps are created. This option +overrides that and uses per-thread mmaps. A side-effect of that is that +inheritance is automatically disabled. --per-thread is ignored with a warning +if combined with -a or -C options. SEE ALSO -------- diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 7c8020a32784..f5b18b8fe8c9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -800,6 +800,7 @@ static struct perf_record record = { .freq = 4000, .target = { .uses_mmap = true, + .default_per_cpu = true, }, }, }; @@ -888,8 +889,8 @@ const struct option record_options[] = { "sample by weight (on special events only)"), OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction, "sample transaction flags (special events only)"), - OPT_BOOLEAN(0, "force-per-cpu", &record.opts.target.force_per_cpu, - "force the use of per-cpu mmaps"), + OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread, + "use per-thread mmaps"), OPT_END() }; diff --git a/tools/perf/tests/attr/test-record-no-inherit b/tools/perf/tests/attr/test-record-no-inherit index 9079a25cd643..44edcb2edcd5 100644 --- a/tools/perf/tests/attr/test-record-no-inherit +++ b/tools/perf/tests/attr/test-record-no-inherit @@ -3,5 +3,5 @@ command = record args = -i kill >/dev/null 2>&1 [event:base-record] -sample_type=259 +sample_type=263 inherit=0 diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index bbc746aa5716..76fa76431329 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -819,8 +819,10 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target) if (evlist->threads == NULL) return -1; - if (target->force_per_cpu) - evlist->cpus = cpu_map__new(target->cpu_list); + if (target->default_per_cpu) + evlist->cpus = target->per_thread ? + cpu_map__dummy_new() : + cpu_map__new(target->cpu_list); else if (target__has_task(target)) evlist->cpus = cpu_map__dummy_new(); else if (!target__has_cpu(target) && !target->uses_mmap) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index dad64926170f..b5fe7f9b2e15 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -574,6 +574,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_evsel *leader = evsel->leader; struct perf_event_attr *attr = &evsel->attr; int track = !evsel->idx; /* only the first counter needs these */ + bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread; attr->sample_id_all = perf_missing_features.sample_id_all ? 
0 : 1; attr->inherit = !opts->no_inherit; @@ -647,7 +648,7 @@ void perf_evsel__config(struct perf_evsel *evsel, } } - if (target__has_cpu(&opts->target) || opts->target.force_per_cpu) + if (target__has_cpu(&opts->target)) perf_evsel__set_sample_bit(evsel, CPU); if (opts->period) @@ -655,7 +656,7 @@ void perf_evsel__config(struct perf_evsel *evsel, if (!perf_missing_features.sample_id_all && (opts->sample_time || !opts->no_inherit || - target__has_cpu(&opts->target) || opts->target.force_per_cpu)) + target__has_cpu(&opts->target) || per_cpu)) perf_evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples) { diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c index 3c778a07b7cc..e74c5963dc7a 100644 --- a/tools/perf/util/target.c +++ b/tools/perf/util/target.c @@ -55,6 +55,13 @@ enum target_errno target__validate(struct target *target) ret = TARGET_ERRNO__UID_OVERRIDE_SYSTEM; } + /* THREAD and SYSTEM/CPU are mutually exclusive */ + if (target->per_thread && (target->system_wide || target->cpu_list)) { + target->per_thread = false; + if (ret == TARGET_ERRNO__SUCCESS) + ret = TARGET_ERRNO__SYSTEM_OVERRIDE_THREAD; + } + return ret; } @@ -100,6 +107,7 @@ static const char *target__error_str[] = { "UID switch overriding CPU", "PID/TID switch overriding SYSTEM", "UID switch overriding SYSTEM", + "SYSTEM/CPU switch overriding PER-THREAD", "Invalid User: %s", "Problems obtaining information for user %s", }; @@ -131,7 +139,8 @@ int target__strerror(struct target *target, int errnum, msg = target__error_str[idx]; switch (errnum) { - case TARGET_ERRNO__PID_OVERRIDE_CPU ... TARGET_ERRNO__UID_OVERRIDE_SYSTEM: + case TARGET_ERRNO__PID_OVERRIDE_CPU ... + TARGET_ERRNO__SYSTEM_OVERRIDE_THREAD: snprintf(buf, buflen, "%s", msg); break; diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h index 2d0c50690892..31dd2e9a27d0 100644 --- a/tools/perf/util/target.h +++ b/tools/perf/util/target.h @@ -12,7 +12,8 @@ struct target { uid_t uid; bool system_wide; bool uses_mmap; - bool force_per_cpu; + bool default_per_cpu; + bool per_thread; }; enum target_errno { @@ -33,6 +34,7 @@ enum target_errno { TARGET_ERRNO__UID_OVERRIDE_CPU, TARGET_ERRNO__PID_OVERRIDE_SYSTEM, TARGET_ERRNO__UID_OVERRIDE_SYSTEM, + TARGET_ERRNO__SYSTEM_OVERRIDE_THREAD, /* for target__parse_uid() */ TARGET_ERRNO__INVALID_UID, From 4bc437964ef540462bd15af4a713da62961809aa Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Nov 2013 11:55:55 +0200 Subject: [PATCH 020/282] perf tools: Allow '--inherit' as the negation of '--no-inherit' Long options can be negated by prefixing them with 'no-'. However options that already start with 'no-', such as '--no-inherit' result in ugly double 'no's. Avoid that by accepting that the removal of 'no-' also negates the long option. 
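To see the negation rule in isolation, a minimal standalone sketch in C (hypothetical helper, not perf's parse-options API) is given below: for an option whose long name already begins with "no-", the parser also accepts the name with that prefix stripped and treats it as the negated form.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: a single negatable boolean option such as "no-inherit".
 * Returns true if 'arg' matched the option; *value is the resulting setting.
 */
static bool match_negatable(const char *opt_name, const char *arg, bool *value)
{
	if (!strcmp(arg, opt_name)) {			/* --no-inherit */
		*value = true;
		return true;
	}
	if (!strncmp(arg, "no-", 3) && !strcmp(arg + 3, opt_name)) {
		*value = false;				/* --no-no-inherit still negates */
		return true;
	}
	if (!strncmp(opt_name, "no-", 3) && !strcmp(arg, opt_name + 3)) {
		*value = false;				/* --inherit negates --no-inherit */
		return true;
	}
	return false;
}

int main(void)
{
	bool no_inherit = true;

	if (match_negatable("no-inherit", "inherit", &no_inherit))
		printf("no_inherit=%d\n", no_inherit);	/* prints no_inherit=0 */
	return 0;
}

The patch below implements the same idea inside parse_long_opt(), including the abbreviated-name case.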
Suggested-by: Peter Zijlstra Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1384768557-23331-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-options.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 31f404a032a9..b6b39ff14fbb 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -224,6 +224,24 @@ static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, return 0; } if (!rest) { + if (!prefixcmp(options->long_name, "no-")) { + /* + * The long name itself starts with "no-", so + * accept the option without "no-" so that users + * do not have to enter "no-no-" to get the + * negation. + */ + rest = skip_prefix(arg, options->long_name + 3); + if (rest) { + flags |= OPT_UNSET; + goto match; + } + /* Abbreviated case */ + if (!prefixcmp(options->long_name + 3, arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + } /* abbreviated? */ if (!strncmp(options->long_name, arg, arg_end - arg)) { is_abbreviated: @@ -259,6 +277,7 @@ is_abbreviated: if (!rest) continue; } +match: if (*rest) { if (*rest != '=') continue; From 167faf32b07fc47637048fbcbdfcf4a89481686d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Nov 2013 11:55:56 +0200 Subject: [PATCH 021/282] perf tools: Add option macro OPT_BOOLEAN_SET OPT_BOOLEAN_SET records whether a boolean option was set by the user. That information can be used to change the default value for the option after the options have been parsed. Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1384768557-23331-4-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-options.c | 2 ++ tools/perf/util/parse-options.h | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index b6b39ff14fbb..d22e3f8017dc 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -78,6 +78,8 @@ static int get_value(struct parse_opt_ctx_t *p, case OPTION_BOOLEAN: *(bool *)opt->value = unset ? false : true; + if (opt->set) + *(bool *)opt->set = true; return 0; case OPTION_INCR: diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index b0241e28eaf7..cbf0149cf221 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -82,6 +82,9 @@ typedef int parse_opt_cb(const struct option *, const char *arg, int unset); * OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in * the value when met. * CALLBACKS can use it like they want. 
+ * + * `set`:: + * whether an option was set by the user */ struct option { enum parse_opt_type type; @@ -94,6 +97,7 @@ struct option { int flags; parse_opt_cb *callback; intptr_t defval; + bool *set; }; #define check_vtype(v, type) ( BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v ) @@ -103,6 +107,10 @@ struct option { #define OPT_GROUP(h) { .type = OPTION_GROUP, .help = (h) } #define OPT_BIT(s, l, v, h, b) { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h), .defval = (b) } #define OPT_BOOLEAN(s, l, v, h) { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h) } +#define OPT_BOOLEAN_SET(s, l, v, os, h) \ + { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), \ + .value = check_vtype(v, bool *), .help = (h), \ + .set = check_vtype(os, bool *)} #define OPT_INCR(s, l, v, h) { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } #define OPT_SET_UINT(s, l, v, h, i) { .type = OPTION_SET_UINT, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h), .defval = (i) } #define OPT_SET_PTR(s, l, v, h, p) { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) } From 69e7e5b02bc6a9e5cf4a54911b27ca133cc1f99f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Nov 2013 11:55:57 +0200 Subject: [PATCH 022/282] perf record: Default -t option to no inheritance The change to per-cpu mmaps causes the -p, -t and -u options now to have inheritance enabled by default. Change that back to no inheritance but for the -t option only. Requested-by: Peter Zijlstra Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1384768557-23331-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 2 ++ tools/perf/builtin-record.c | 8 ++++++-- tools/perf/perf.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 6ac867e7667f..c407897f0435 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -57,6 +57,8 @@ OPTIONS -t:: --tid=:: Record events on existing thread ID (comma separated list). + This option also disables inheritance by default. Enable it by adding + --inherit. 
-u:: --uid=:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f5b18b8fe8c9..65615a8bc25e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -843,8 +843,9 @@ const struct option record_options[] = { OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), OPT_STRING('o', "output", &record.file.path, "file", "output file name"), - OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit, - "child tasks do not inherit counters"), + OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, + &record.opts.no_inherit_set, + "child tasks do not inherit counters"), OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"), OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages", "number of mmap data pages", @@ -939,6 +940,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) goto out_symbol_exit; } + if (rec->opts.target.tid && !rec->opts.no_inherit_set) + rec->opts.no_inherit = true; + err = target__validate(&rec->opts.target); if (err) { target__strerror(&rec->opts.target, err, errbuf, BUFSIZ); diff --git a/tools/perf/perf.h b/tools/perf/perf.h index b079304bd53d..b23fed527514 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -254,6 +254,7 @@ struct perf_record_opts { bool inherit_stat; bool no_delay; bool no_inherit; + bool no_inherit_set; bool no_samples; bool raw_samples; bool sample_address; From 0a8eb275cbdb8462854d5f7e1168d86cee4cc9ea Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 Nov 2013 20:25:45 +0400 Subject: [PATCH 023/282] perf timechart: Always try to print at least 15 tasks Always try to print at least 15 tasks no matter how long they run. Signed-off-by: Stanislav Fomichev Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1383323151-19810-2-git-send-email-stfomichev@yandex-team.ru Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 41c9bde2fb67..bb21e57ff9bb 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -945,15 +945,17 @@ static void write_svg_file(const char *filename) { u64 i; int count; + int thresh = TIME_THRESH; numcpus++; - count = determine_display_tasks(TIME_THRESH); - - /* We'd like to show at least 15 tasks; be less picky if we have fewer */ - if (count < 15) - count = determine_display_tasks(TIME_THRESH / 10); + /* We'd like to show at least proc_num tasks; + * be less picky if we have fewer */ + do { + count = determine_display_tasks(thresh); + thresh /= 10; + } while (!process_filter && thresh && count < 15); open_svg(filename, numcpus, count, first_time, last_time); From 54874e3236b834064943c02a647823ab5d97be57 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 Nov 2013 20:25:46 +0400 Subject: [PATCH 024/282] perf timechart: Add option to limit number of tasks Add -n option to specify min. number of tasks to print. 
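Combined with the previous patch, the effect is that timechart keeps relaxing its per-task duration threshold until at least the requested number of tasks qualifies. A simplified, self-contained sketch of that selection loop (hypothetical task data, not the builtin-timechart code) looks like this:

#include <stdio.h>

/* Hypothetical task runtimes in nanoseconds. */
static const unsigned long long total_time[] = {
	900000000ULL, 40000000ULL, 2500000ULL, 120000ULL, 90000ULL,
};
#define NR_TASKS (sizeof(total_time) / sizeof(total_time[0]))

/* Count how many tasks ran for at least 'threshold' ns. */
static int count_display_tasks(unsigned long long threshold)
{
	int i, count = 0;

	for (i = 0; i < (int)NR_TASKS; i++)
		if (total_time[i] >= threshold)
			count++;
	return count;
}

int main(void)
{
	unsigned long long thresh = 1000000000ULL;	/* start at 1 second */
	int proc_num = 3;				/* want at least 3 tasks */
	int count;

	/* Relax the threshold by 10x until enough tasks qualify. */
	do {
		count = count_display_tasks(thresh);
		thresh /= 10;
	} while (thresh && count < proc_num);

	printf("showing %d tasks\n", count);	/* prints "showing 3 tasks" */
	return 0;
}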
Signed-off-by: Stanislav Fomichev Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1383323151-19810-3-git-send-email-stfomichev@yandex-team.ru Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-timechart.txt | 4 ++++ tools/perf/builtin-timechart.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt index 3ff8bd4f0b4d..4373144a43b4 100644 --- a/tools/perf/Documentation/perf-timechart.txt +++ b/tools/perf/Documentation/perf-timechart.txt @@ -54,6 +54,10 @@ $ perf timechart Written 10.2 seconds of trace to output.svg. +-n:: +--proc-num:: + Print task info for at least given number of tasks. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index bb21e57ff9bb..c352be418f98 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -41,6 +41,7 @@ #define SUPPORT_OLD_POWER_EVENTS 1 #define PWR_EVENT_EXIT -1 +static int proc_num = 15; static unsigned int numcpus; static u64 min_freq; /* Lowest CPU frequency seen */ @@ -955,7 +956,7 @@ static void write_svg_file(const char *filename) do { count = determine_display_tasks(thresh); thresh /= 10; - } while (!process_filter && thresh && count < 15); + } while (!process_filter && thresh && count < proc_num); open_svg(filename, numcpus, count, first_time, last_time); @@ -1102,6 +1103,8 @@ int cmd_timechart(int argc, const char **argv, parse_process), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), + OPT_INTEGER('n', "proc-num", &proc_num, + "min. number of tasks to print"), OPT_END() }; const char * const timechart_usage[] = { From 753c505dc49a87a4421d452bda048e4b93e8e42b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 Nov 2013 20:25:47 +0400 Subject: [PATCH 025/282] perf timechart: Use proc_num to implement --power-only Don't use special flag to indicate power-only mode, just set proc_num to 0. 
Signed-off-by: Stanislav Fomichev Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1383323151-19810-4-git-send-email-stfomichev@yandex-team.ru Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index c352be418f98..6410c9ead9e2 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -912,7 +912,7 @@ static int determine_display_tasks(u64 threshold) /* no exit marker, task kept running to the end */ if (p->end_time == 0) p->end_time = last_time; - if (p->total_time >= threshold && !power_only) + if (p->total_time >= threshold) p->display = 1; c = p->all; @@ -923,7 +923,7 @@ static int determine_display_tasks(u64 threshold) if (c->start_time == 1) c->start_time = first_time; - if (c->total_time >= threshold && !power_only) { + if (c->total_time >= threshold) { c->display = 1; count++; } @@ -950,6 +950,8 @@ static void write_svg_file(const char *filename) numcpus++; + if (power_only) + proc_num = 0; /* We'd like to show at least proc_num tasks; * be less picky if we have fewer */ @@ -967,9 +969,11 @@ static void write_svg_file(const char *filename) svg_cpu_box(i, max_freq, turbo_frequency); draw_cpu_usage(); - draw_process_bars(); + if (proc_num) + draw_process_bars(); draw_c_p_states(); - draw_wakeups(); + if (proc_num) + draw_wakeups(); svg_close(); } From c87097d39dae1c42a5068e00dd3b76a4162ee0fc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 Nov 2013 20:25:48 +0400 Subject: [PATCH 026/282] perf timechart: Add support for displaying only tasks related data In order to make SVG smaller and faster to browse add possibility to switch off power related information with -T switch. 
Signed-off-by: Stanislav Fomichev Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1383323151-19810-5-git-send-email-stfomichev@yandex-team.ru Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-timechart.txt | 3 +++ tools/perf/builtin-timechart.c | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt index 4373144a43b4..d8441894d9ec 100644 --- a/tools/perf/Documentation/perf-timechart.txt +++ b/tools/perf/Documentation/perf-timechart.txt @@ -35,6 +35,9 @@ OPTIONS -P:: --power-only:: Only output the CPU power section of the diagram +-T:: +--tasks-only:: + Don't output processor state transitions -p:: --process:: Select the processes to display, by name or PID diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 6410c9ead9e2..b3f175a30d94 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -51,6 +51,7 @@ static u64 turbo_frequency; static u64 first_time, last_time; static bool power_only; +static bool tasks_only; struct per_pid; @@ -971,7 +972,8 @@ static void write_svg_file(const char *filename) draw_cpu_usage(); if (proc_num) draw_process_bars(); - draw_c_p_states(); + if (!tasks_only) + draw_c_p_states(); if (proc_num) draw_wakeups(); @@ -1102,6 +1104,8 @@ int cmd_timechart(int argc, const char **argv, OPT_STRING('o', "output", &output_name, "file", "output file name"), OPT_INTEGER('w', "width", &svg_page_width, "page width"), OPT_BOOLEAN('P', "power-only", &power_only, "output power data only"), + OPT_BOOLEAN('T', "tasks-only", &tasks_only, + "output processes data only"), OPT_CALLBACK('p', "process", NULL, "process", "process selector. Pass a pid or process name.", parse_process), @@ -1119,6 +1123,11 @@ int cmd_timechart(int argc, const char **argv, argc = parse_options(argc, argv, options, timechart_usage, PARSE_OPT_STOP_AT_NON_OPTION); + if (power_only && tasks_only) { + pr_err("-P and -T options cannot be used at the same time.\n"); + return -1; + } + symbol__init(); if (argc && !strncmp(argv[0], "rec", 3)) From cbb2e81e5232b1bca5cd2aa1d7a7eb1cd30f8304 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 Nov 2013 20:25:49 +0400 Subject: [PATCH 027/282] perf timechart: Group figures and add title with details Add titles to figures so we can run SVG interactively in Firefox and check event details in the tooltips. This also aids exploring SVG with Inkscape because when user clicks on one part of logical figure, all parts are selected. It's also possible to read titles with Inkscape in the object details. 
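The grouping relies on plain SVG: related shapes are wrapped in a <g> element with a <title> child, which browsers show as a tooltip and Inkscape lists in the object details. A minimal sketch of emitting such a group (illustrative only, not the svghelper.c helpers; the title text would need XML escaping in real use) could be:

#include <stdio.h>

/*
 * Emit one sample rectangle wrapped in a group whose <title> holds the
 * event details; hovering the group in a browser shows the title as a
 * tooltip, and clicking it in Inkscape selects the whole group.
 */
static void emit_titled_box(FILE *svg, double x, double y,
			    double w, double h, const char *details)
{
	fprintf(svg, "<g>\n");
	fprintf(svg, "  <title>%s</title>\n", details);
	fprintf(svg, "  <rect x=\"%.2f\" y=\"%.2f\" width=\"%.2f\" height=\"%.2f\" class=\"sample\"/>\n",
		x, y, w, h);
	fprintf(svg, "</g>\n");
}

int main(void)
{
	emit_titled_box(stdout, 10.0, 20.0, 120.0, 15.0,
			"cpu 2: 0.030103 s to 0.031857 s (1.754 ms)");
	return 0;
}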
Signed-off-by: Stanislav Fomichev Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1383323151-19810-6-git-send-email-stfomichev@yandex-team.ru Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 6 ++-- tools/perf/util/svghelper.c | 56 ++++++++++++++++++++++++++++++++-- tools/perf/util/svghelper.h | 5 +-- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index b3f175a30d94..6a848b8c4c16 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -798,11 +798,11 @@ static void draw_process_bars(void) sample = c->samples; while (sample) { if (sample->type == TYPE_RUNNING) - svg_sample(Y, sample->cpu, sample->start_time, sample->end_time); + svg_running(Y, sample->cpu, sample->start_time, sample->end_time); if (sample->type == TYPE_BLOCKED) - svg_box(Y, sample->start_time, sample->end_time, "blocked"); + svg_blocked(Y, sample->cpu, sample->start_time, sample->end_time); if (sample->type == TYPE_WAITING) - svg_waiting(Y, sample->start_time, sample->end_time); + svg_waiting(Y, sample->cpu, sample->start_time, sample->end_time); sample = sample->next; } diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 96c866045d60..9a5b41392e01 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -95,6 +95,7 @@ void open_svg(const char *filename, int cpus, int rows, u64 start, u64 end) total_height = (1 + rows + cpu2slot(cpus)) * SLOT_MULT; fprintf(svgfile, " \n"); + fprintf(svgfile, "\n"); fprintf(svgfile, "\n", svg_page_width, total_height); fprintf(svgfile, "\n